diff --git a/README.md b/README.md index 937fd2a..f83a717 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,12 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * Hungarian: * ISO-8859-2 * WINDOWS-1250 + * Italian + * ISO-8859-1 + * ISO-8859-3 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 * Japanese * ISO-2022-JP * SHIFT_JIS diff --git a/script/BuildLangModelLogs/LangItalianModel.log b/script/BuildLangModelLogs/LangItalianModel.log new file mode 100644 index 0000000..6b539c0 --- /dev/null +++ b/script/BuildLangModelLogs/LangItalianModel.log @@ -0,0 +1,162 @@ += Logs of language model for Italian (it) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 18:43:12.831409 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Pieve Ligure (revision 83186252) +010 (prefisso) (revision 76157203) +1000 (revision 83185341) +1143 (revision 70627567) +1162 (revision 70627612) +118 - Emergenza sanitaria (revision 83267411) +1201 (revision 77523243) +1202 (revision 76764411) +1374 (revision 78259457) +1404 (revision 70628069) +1520 (revision 76854924) +1537 (revision 70628296) +1582 (revision 80626188) +1584 (revision 76837051) +1600 (revision 76869356) +1619 (revision 70628455) +1742 (revision 70628675) +1748 (revision 70628682) +1749 (revision 70628684) +1750 (revision 70628690) +1754 (revision 70628697) +1775 (revision 70628734) +1797 (revision 78338823) +1798 (revision 82047236) +1803 (revision 77502534) +1805 (revision 79369853) +1809 (revision 70628789) +1810 (revision 82930218) +1814 (revision 78338825) +1815 (revision 82669615) +1816 (revision 83185384) +1818 (revision 72407239) +1823 (revision 74880156) +1859 (revision 83185401) +1860 (revision 83185403) +1861 (revision 83185412) +1868 (revision 83185430) +1874 (revision 83185441) +1897 (revision 83185267) +1908 (revision 83185631) +1909 (revision 83185630) +1913 (revision 83185626) +1915 (revision 83185625) +1917 (revision 83185270) +1920 (revision 
83185621) +1921 (revision 83185619) +1923 (revision 83185616) +1925 (revision 83185614) +1926 (revision 83185612) +1928 (revision 83185610) +1929 (revision 83185609) +1939 (revision 83185598) +1946 (revision 83185590) +1947 (revision 83185589) +1948 (revision 83185587) +1951 (revision 83185584) +1956 (revision 83185478) +1960 (revision 83185487) +1964 (revision 83185493) +1965 (revision 83185494) +1969 (revision 83185500) +1970 (revision 83185503) +1971 (revision 83185505) +1975 (revision 83185510) +1976 (revision 83185513) +1977 (revision 83185514) +1980 (revision 83185518) +1981 (revision 83308867) +1983 (revision 83185524) +1985 (revision 83185526) +1988 (revision 83185280) +1990 (revision 83185531) +1995 (revision 83185538) +1999 (revision 83326325) +2000 (revision 83185544) +2001 (revision 83309058) +2002 (revision 83185545) +2003 (revision 83185546) +2004 (revision 83185283) +2005 (revision 83185285) +2006 (revision 83185547) +2007 (revision 83185549) +2008 (revision 83185551) +2009 (revision 83185552) +2010 (revision 83185287) +2012 (revision 83185289) +712 (revision 70630167) +749 (revision 78272323) +ATP (Provincia di Genova) (revision 82754117) +Abbazia di San Colombano (revision 83062997) +Abbazia di San Fruttuoso (revision 83288120) +Acacia dealbata (revision 83036867) +Acquedotto (revision 82973825) +Affresco (revision 82000422) +Agricoltura (revision 82578266) +Allevamento (revision 82971452) +Altitudine (revision 82971213) +Angelo (revision 82333116) +Anni 1960 (revision 83161222) +Anni 1970 (revision 81663175) +Antica Roma (revision 83125874) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 18:46:08.840718 + +59 characters appeared 823241 times. 
+ +First 34 characters: +[ 0] Char i: 11.823147778111148 % +[ 1] Char a: 11.252112078965942 % +[ 2] Char e: 10.910170897707962 % +[ 3] Char o: 8.936386793174782 % +[ 4] Char n: 7.317055394471364 % +[ 5] Char l: 6.931263141655967 % +[ 6] Char r: 6.521784021932824 % +[ 7] Char t: 6.386708145002497 % +[ 8] Char s: 4.572415610981475 % +[ 9] Char c: 4.116291584116923 % +[10] Char d: 3.9770856893667834 % +[11] Char u: 2.8944136650142545 % +[12] Char m: 2.762860450342002 % +[13] Char p: 2.6809889206198427 % +[14] Char g: 2.1493098618751985 % +[15] Char v: 1.5369739845318686 % +[16] Char b: 1.2855287819727153 % +[17] Char f: 0.9932692856648295 % +[18] Char z: 0.9664241698360504 % +[19] Char h: 0.7159507361756764 % +[20] Char q: 0.2416060424590126 % +[21] Char k: 0.18876610858788617 % +[22] Char à: 0.15596890825408355 % +[23] Char y: 0.12462936126844994 % +[24] Char è: 0.11600491229178332 % +[25] Char w: 0.10628722330398996 % +[26] Char x: 0.10312897438295712 % +[27] Char j: 0.07555503188009344 % +[28] Char ù: 0.05575524056746445 % +[29] Char ò: 0.03304014255849745 % +[30] Char é: 0.021014502436103158 % +[31] Char ì: 0.0191924357508919 % +[32] Char á: 0.004737373381549267 % +[33] Char ó: 0.003644133370422513 % + +The first 34 characters have an accumulated ratio of 0.9997947138201325. + +872 sequences found. + +First 512 (typical positive ratio): 0.9989484485502651 +Next 512 (512-1024): 1.214711123474171e-06 +Rest: -4.336808689942018e-17 + +- Processing end: 2016-09-21 18:46:08.920456 diff --git a/script/langs/it.py b/script/langs/it.py new file mode 100644 index 0000000..aebf37f --- /dev/null +++ b/script/langs/it.py @@ -0,0 +1,56 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Italian' +code = 'it' +use_ascii = True +charsets = ['ISO-8859-1', 'ISO-8859-3', 'ISO-8859-9', + 'ISO-8859-15', 'WINDOWS-1252'] + +## Optional Properties ## + +alphabet = 'óéèò' +start_pages = ['Pieve Ligure'] +wikipedia_code = code +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c01126c..f859915 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ set( LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp + LangModels/LangItalianModel.cpp LangModels/LangLithuanianModel.cpp LangModels/LangLatvianModel.cpp LangModels/LangMalteseModel.cpp diff --git a/src/LangModels/LangItalianModel.cpp b/src/LangModels/LangItalianModel.cpp new file mode 100644 index 0000000..0a9565c --- /dev/null +++ b/src/LangModels/LangItalianModel.cpp @@ -0,0 +1,264 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Italian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 18:46:08.841217 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_3_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 59,SYM,SYM,SYM,ILL, 60,SYM,SYM, 61, 48, 47, 62,SYM,ILL, 58, /* AX */ + SYM, 63,SYM,SYM,SYM,SYM, 64,SYM,SYM, 46, 48, 47, 65,SYM,ILL, 58, /* BX */ + 22, 32, 50,ILL, 39, 66, 67, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + ILL, 44, 29, 33, 51, 68, 34,SYM, 69, 28, 45, 70, 36, 71, 72, 73, /* DX */ + 22, 32, 50,ILL, 39, 74, 75, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + ILL, 44, 29, 33, 51, 76, 34,SYM, 77, 28, 45, 78, 36, 79, 80,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 35,SYM, 35,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 41, 81,SYM,SYM, 41,SYM,SYM,SYM, 52, 52, 82,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 56, 44, 29, 33, 51, 83, 34,SYM, 57, 28, 45, 84, 36, 85, 86, 87, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 56, 44, 29, 33, 51, 88, 34,SYM, 57, 28, 45, 89, 36, 90, 91, 92, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 93,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 47, 44, 29, 33, 51, 94, 34,SYM, 57, 28, 45, 95, 36, 96, 48, 97, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 47, 44, 29, 33, 51, 98, 34,SYM, 57, 28, 45, 99, 36, 46, 48,100, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned 
char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,101,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 56, 44, 29, 33, 51,102, 34,SYM, 57, 28, 45,103, 36,104,105,106, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 56, 44, 29, 33, 51,107, 34,SYM, 57, 28, 45,108, 36,109,110,111, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,112,SYM,SYM,SYM,SYM,SYM,SYM, 35,SYM, 52,ILL, 
41,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 35,SYM, 52,ILL, 41,113, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,114,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 56, 44, 29, 33, 51,115, 34,SYM, 57, 28, 45,116, 36,117,118,119, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 56, 44, 29, 33, 51,120, 34,SYM, 57, 28, 45,121, 36,122,123,124, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 872 + * First 512 sequences: 0.9989484485502651 + * Next 512 sequences (512-1024): 0.0010515514497349433 + * Rest: -4.336808689942018e-17 + * Negative sequences: TODO + */ +static const PRUint8 ItalianLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,3,3,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,3,3,0,2,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,2,3,2,3,0,3,3,2,2,0, + 3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,0,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,0,2,3,3,2,3,2,2,3,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,0,0,3,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,3,0,3,0,0,3,2,0,3,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,2,3,3,2,3,2,3,2,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,0,3,2,3,3,3,0,3,2,3,0,0, + 3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,0,2,0,0,0,3,0,2,3,0,0,3,2,2,2,2, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,0,3,2,3,0,2,0,2,0,3,2,0,2,2, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,2,0,3,2,2,0,3,0,2,2,2,0,2,2,0,0,2, + 3,3,3,3,2,3,3,0,2,2,2,3,2,2,2,3,2,0,0,2,0,2,2,3,2,0,0,0,0,2,2,2,2,0, 
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,3,0,2,0,3,0,3,0,2,2,2,2,3,2,0, + 3,3,3,3,0,3,3,3,2,3,0,3,2,2,3,2,2,3,0,2,0,2,0,0,2,2,2,2,2,0,2,0,0,0, + 3,3,3,3,3,2,2,2,2,0,2,3,0,2,3,0,3,2,3,3,0,3,0,3,0,2,0,2,0,3,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,0,2,0,3,0,3,0,3,0,2,0,0,3,0,3,0, + 2,3,0,2,0,0,2,0,2,0,0,3,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,2,2,2,0,3,0,3,0,3,0,2,2,2,0,0,0,0,2,2, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,0,0,0,2,0,2,0,2,2,2,0,0,0,0,0,0, + 2,0,0,0,2,0,3,0,2,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,2,0,2,0,2,0,0,2,2,0,3,0,0,0,2,0,3,0,0,0,0,0,0,0,0, + 3,3,3,3,0,3,0,3,2,3,0,2,0,3,0,3,0,0,0,0,0,2,0,2,0,2,3,0,0,0,0,0,0,0, + 3,3,3,3,2,2,2,2,0,2,2,3,2,0,0,0,0,0,0,2,0,3,0,2,0,2,0,2,0,0,0,0,0,2, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,3,2,2,3,3,2,3,2,3,0,2,2,0,2,3,0,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0, + 0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,3,2,2,0,2,2,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,3,2,2,0,0,2,2,0,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_3ItalianModel = +{ + Iso_8859_3_CharToOrderMap, + ItalianLangModel, + 34, + (float)0.9989484485502651, + PR_TRUE, + "ISO-8859-3" +}; + +const SequenceModel Iso_8859_15ItalianModel = +{ + Iso_8859_15_CharToOrderMap, + ItalianLangModel, + 34, + (float)0.9989484485502651, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Iso_8859_9ItalianModel = +{ + Iso_8859_9_CharToOrderMap, + ItalianLangModel, + 34, + (float)0.9989484485502651, + PR_TRUE, + "ISO-8859-9" +}; + +const SequenceModel Iso_8859_1ItalianModel = +{ + Iso_8859_1_CharToOrderMap, + ItalianLangModel, + 34, + (float)0.9989484485502651, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Windows_1252ItalianModel = +{ 
+ Windows_1252_CharToOrderMap, + ItalianLangModel, + 34, + (float)0.9989484485502651, + PR_TRUE, + "WINDOWS-1252" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index e26540c..1530e46 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -150,6 +150,13 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel); mProbers[65] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); + /* Italian probers fill slots 66-70 contiguously: slot 65 is already taken by the Finnish Windows-1252 prober, and the last slot must equal NUM_OF_SBCS_PROBERS - 1 (70). */ + mProbers[66] = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel); + mProbers[67] = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel); + mProbers[68] = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel); + mProbers[69] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel); + mProbers[70] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index aeeac65..7f80b63 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 66 +#define NUM_OF_SBCS_PROBERS 71 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 2905c05..7472054 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -211,5 +211,11 @@ extern const SequenceModel Iso_8859_13FinnishModel; extern const SequenceModel Iso_8859_15FinnishModel; extern const SequenceModel Windows_1252FinnishModel; +extern const SequenceModel Iso_8859_1ItalianModel; +extern const SequenceModel Iso_8859_3ItalianModel; +extern const SequenceModel Iso_8859_9ItalianModel; +extern const SequenceModel Iso_8859_15ItalianModel; +extern const SequenceModel Windows_1252ItalianModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/it/iso-8859-1.txt b/test/it/iso-8859-1.txt new file mode 100644 index 0000000..0afbc36 --- /dev/null +++ b/test/it/iso-8859-1.txt @@ -0,0 
+1,18 @@ +L'architettura longobarda è costituita dall'insieme delle opere architettoniche +realizzate in Italia durante il regno dei Longobardi (568-774), con residuale +permanenza nell'Italia meridionale fino al X-XI secolo (Langobardia Minor), e +commissionate dai re e dai duchi longobardi. +L'attività architettonica sviluppata in Langobardia Maior è andata in gran parte +perduta, per lo più a causa di successive ricostruzioni degli edifici sacri e +profani eretti tra VII e VIII secolo. A parte il Tempietto longobardo di +Cividale del Friuli, rimasto in gran parte intatto, gli edifici civili e +religiosi di Pavia, Monza o altre località sono stati ampiamente rimaneggiati +nei secoli seguenti. Ancora integre rimangono così soltanto poche architetture, +o perché inglobate negli ampliamenti successivi - come la chiesa di San +Salvatore a Brescia) -, o perché periferiche e di modeste dimensioni - come la +chiesa di Santa Maria foris portas a Castelseprio. Testimonianze maggiormente +fedeli alla forma originale si ritrovano, invece, nella Langobardia Minor: a +Benevento si conservano la chiesa di Santa Sofia, un ampio tratto delle Mura e +la Rocca dei Rettori, unici esempi superstiti di architettura militare +longobarda, mentre altre testimonianze si sono conservate in centri minori del +ducato beneventano e a Spoleto. diff --git a/test/it/utf-8.txt b/test/it/utf-8.txt new file mode 100644 index 0000000..cefd085 --- /dev/null +++ b/test/it/utf-8.txt @@ -0,0 +1,18 @@ +L'architettura longobarda è costituita dall'insieme delle opere architettoniche +realizzate in Italia durante il regno dei Longobardi (568-774), con residuale +permanenza nell'Italia meridionale fino al X-XI secolo (Langobardia Minor), e +commissionate dai re e dai duchi longobardi. +L'attività architettonica sviluppata in Langobardia Maior è andata in gran parte +perduta, per lo più a causa di successive ricostruzioni degli edifici sacri e +profani eretti tra VII e VIII secolo. 
A parte il Tempietto longobardo di +Cividale del Friuli, rimasto in gran parte intatto, gli edifici civili e +religiosi di Pavia, Monza o altre località sono stati ampiamente rimaneggiati +nei secoli seguenti. Ancora integre rimangono così soltanto poche architetture, +o perché inglobate negli ampliamenti successivi - come la chiesa di San +Salvatore a Brescia) -, o perché periferiche e di modeste dimensioni - come la +chiesa di Santa Maria foris portas a Castelseprio. Testimonianze maggiormente +fedeli alla forma originale si ritrovano, invece, nella Langobardia Minor: a +Benevento si conservano la chiesa di Santa Sofia, un ampio tratto delle Mura e +la Rocca dei Rettori, unici esempi superstiti di architettura militare +longobarda, mentre altre testimonianze si sono conservate in centri minori del +ducato beneventano e a Spoleto.