From 178c6119b8e435dae38e276f2191637d04c327ec Mon Sep 17 00:00:00 2001 From: Jehan Date: Sat, 13 Feb 2016 02:32:57 +0100 Subject: [PATCH] LangModels: add Windows-1258 support for Vietnamese. I was planning on adding VISCII support as well, but Python encode() method does not have any support for it apparently, so I cannot generate the proper statistics data with the current version of the string. --- README.md | 2 + .../LangVietnameseModel.log | 99 +++++++++++++ script/charsets/viscii.py | 72 +++++++++ script/charsets/windows-1258.py | 72 +++++++++ script/langs/vi.py | 72 +++++++++ src/CMakeLists.txt | 1 + src/LangModels/LangVietnameseModel.cpp | 139 ++++++++++++++++++ src/nsSBCSGroupProber.cpp | 2 + src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 2 + test/vi/utf-8.txt | 4 + test/vi/windows-1258.txt | 4 + 12 files changed, 470 insertions(+), 1 deletion(-) create mode 100644 script/BuildLangModelLogs/LangVietnameseModel.log create mode 100644 script/charsets/viscii.py create mode 100644 script/charsets/windows-1258.py create mode 100644 script/langs/vi.py create mode 100644 src/LangModels/LangVietnameseModel.cpp create mode 100644 test/vi/utf-8.txt create mode 100644 test/vi/windows-1258.txt diff --git a/README.md b/README.md index 3e48ad7..a5ee815 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,8 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * Turkish: * ISO-8859-3 * ISO-8859-9 + * Vietnamese: + * Windows-1258 * Others * WINDOWS-1252 diff --git a/script/BuildLangModelLogs/LangVietnameseModel.log b/script/BuildLangModelLogs/LangVietnameseModel.log new file mode 100644 index 0000000..abcfa81 --- /dev/null +++ b/script/BuildLangModelLogs/LangVietnameseModel.log @@ -0,0 +1,99 @@ += Logs of language model for Vietnamese (vi) = + +- Generated by BuildLangModel.py +- Started: 2016-02-13 02:13:44.503931 +- Maximum depth: 3 +- Max number of pages: 40 + +== Parsed pages == + +Chữ_Quốc_ngữ (revision 22887853) +1651 (revision 
21455247) +1773 (revision 21354755) +1815 (revision 21361292) +1838 (revision 21361314) +1865 (revision 21361338) +1869 (revision 21361342) +1888 (revision 21389506) +1902 (revision 21354811) +1918 (revision 21354828) +1919 (revision 21354829) +1938 (revision 21354849) +1945 (revision 21354857) +22 tháng 2 (revision 21376086) +26 tháng 11 (revision 22579845) +28 tháng 12 (revision 22475308) +A (revision 22549334) +ASCII (revision 22528409) +Alexandre de Rhodes (revision 22859954) +Antonio Barbosa (revision 22145269) +B (revision 22836557) +BBC (revision 22863903) +Biên khảo (revision 22531516) +Bán nguyên âm (revision 22655600) +Bình luận (revision 22117664) +Bảng chữ cái Bồ Đào Nha (revision 22887853) +Bảng chữ cái Hy Lạp (revision 21362081) +Bảng chữ cái Latinh (revision 22442448) +Bắc Kỳ (revision 22393289) +Bồ Đào Nha (revision 22620858) +C (revision 21341881) +Cao Xuân Dục (revision 22620201) +Chính tả (revision 22187359) +Chính tả tiếng Việt (revision 20897580) +Chữ Hán (revision 22889609) +Chữ Nôm (revision 22781506) +Chữ cái (revision 22169220) +Công giáo (revision 22173119) +D (revision 21447691) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-02-13 02:16:03.731928 + +49 characters appeared 190798 times. 
+ +First 33 characters: +[ 0] Char n: 13.15212947724819 % +[ 1] Char h: 10.371702009455026 % +[ 2] Char t: 8.20134382959989 % +[ 3] Char c: 7.433516074591977 % +[ 4] Char i: 7.238545477415906 % +[ 5] Char g: 6.529418547364228 % +[ 6] Char a: 4.203922472981897 % +[ 7] Char u: 3.328127129215191 % +[ 8] Char m: 3.0540152412499086 % +[ 9] Char o: 3.037767691485236 % +[10] Char đ: 2.5948909317707733 % +[11] Char r: 2.4643864191448546 % +[12] Char à: 2.3878657008983324 % +[13] Char v: 2.269939936477322 % +[14] Char l: 2.2327278063711358 % +[15] Char á: 2.0482394993658217 % +[16] Char p: 1.9214037882996675 % +[17] Char b: 1.7998092223188922 % +[18] Char ư: 1.6813593433893437 % +[19] Char s: 1.6069350831769726 % +[20] Char y: 1.4952986928584158 % +[21] Char e: 1.4544177611924654 % +[22] Char d: 1.3139550729043281 % +[23] Char k: 1.2489648738456378 % +[24] Char â: 1.1278944223734 % +[25] Char ê: 0.977997672931582 % +[26] Char ô: 0.8260044654556128 % +[27] Char ó: 0.7091269300516777 % +[28] Char q: 0.60011111227581 % +[29] Char ơ: 0.4192916068302603 % +[30] Char í: 0.4166710342875712 % +[31] Char ă: 0.37998301868992335 % +[32] Char x: 0.34329500309227556 % + +The first 33 characters have an accumulated ratio of 0.9887105734860954. + +852 sequences found. + +First 512 (typical positive ratio): 0.990048941203513 +Next 512 (512-1024): 1.0482290170756506e-05 +Rest: -1.5612511283791264e-17 + +- Processing end: 2016-02-13 02:16:03.877897 diff --git a/script/charsets/viscii.py b/script/charsets/viscii.py new file mode 100644 index 0000000..d20a902 --- /dev/null +++ b/script/charsets/viscii.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'VISCII' +aliases = ['csVISCII'] + +language = \ +{ + # Dedicated to Vietnamese. 
+ 'complete': ['vi'], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,LET,CTR,CTR,LET,LET,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,LET,CTR,CTR,CTR,CTR,LET,CTR,CTR,CTR,CTR,LET,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/charsets/windows-1258.py b/script/charsets/windows-1258.py new file mode 100644 index 0000000..1a6c958 --- /dev/null +++ b/script/charsets/windows-1258.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1258' +aliases = ['cswindows1258'] + +language = \ +{ + # Dedicated to Vietnamese. 
+ 'complete': ['vi'], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,ILL,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,ILL,ILL,ILL, # 8X + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,ILL,ILL,LET, # 9X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET, # CX + LET,LET,SYM,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,SYM,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET, # EX + LET,LET,SYM,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/langs/vi.py b/script/langs/vi.py new file mode 100644 index 0000000..5d49f92 --- /dev/null +++ b/script/langs/vi.py @@ -0,0 +1,72 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Vietnamese' +code = 'vi' +# It actually uses ASCII, but not all of it. +use_ascii = False +# From Wikipedia: +# For systems that lack support for Unicode, dozens of 8-bit Vietnamese code +# pages are available.[1] The most common are VISCII (TCVN 5712:1993), VPS, and +# Windows-1258.[3] Where ASCII is required, such as when ensuring readability in +# plain text e-mail, Vietnamese letters are often encoded according to Vietnamese +# Quoted-Readable (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4] though usage of either +# variable-width scheme has declined dramatically following the adoption of +# Unicode on the World Wide Web. 
+charsets = ['WINDOWS-1258'] # TODO: add 'VISCII' + +## Optional Properties ## + +alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy' +start_pages = ['Chữ_Quốc_ngữ'] +wikipedia_code = code +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +def clean_wikipedia_content(content): + cleaned = re.sub(r'(=+) *([^=]+) *\1', + r'\2', + content) + return cleaned diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 901ce18..39f460c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ set( LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp + LangModels/LangVietnameseModel.cpp nsHebrewProber.cpp nsCharSetProber.cpp nsBig5Prober.cpp diff --git a/src/LangModels/LangVietnameseModel.cpp b/src/LangModels/LangVietnameseModel.cpp new file mode 100644 index 0000000..8400be6 --- /dev/null +++ b/src/LangModels/LangVietnameseModel.cpp @@ -0,0 +1,139 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Vietnamese *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-02-13 02:16:03.733608 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1258_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 6, 17, 3, 22, 21, 35, 5, 1, 4, 38, 23, 14, 8, 0, 9, /* 4X */ + 16, 28, 11, 19, 2, 7, 13, 37, 32, 20, 39,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 6, 17, 3, 22, 21, 35, 5, 1, 4, 38, 23, 14, 8, 0, 9, /* 6X */ + 16, 28, 11, 19, 2, 7, 13, 37, 32, 20, 39,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 49,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 47,ILL,ILL,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 47,ILL,ILL, 50, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 12, 15, 24, 31, 45, 52, 46, 41, 40, 34, 25, 53,SYM, 30, 54, 55, /* CX */ + 10, 56,SYM, 27, 26, 29, 42,SYM, 43, 33, 36, 57, 44, 18,SYM, 48, /* DX */ + 12, 15, 24, 31, 45, 58, 46, 41, 40, 34, 25, 59,SYM, 30, 60, 61, /* EX */ + 10, 62,SYM, 27, 26, 29, 42,SYM, 43, 33, 36, 63, 44, 18,SYM, 65, /* FX: 0xFE is the dong sign (symbol); order 64 is unused */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 852 + * First 512 sequences: 0.990048941203513 + * Next 512 sequences (512-1024): 0.00995105879648696 + * Rest: -1.5612511283791264e-17 + * Negative sequences: TODO + */ +static const PRUint8 VietnameseLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,0,3,3,2,2,2,2,2,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3, + 3,2,3,0,3,0,3,3,3,3,0,3,3,0,2,3,0,0,3,2,3,3,2,2,3,3,3,3,0,3,3,3,0, + 3,3,3,3,3,2,3,3,2,3,2,3,3,3,3,3,3,0,3,3,3,3,2,0,3,3,3,3,0,2,3,3,0, + 3,3,3,3,3,0,3,3,2,3,0,3,3,2,3,3,2,2,3,2,3,3,2,3,3,0,3,3,2,3,0,3,2, + 3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,0,3,2,3,3,3,0,3,2,2,3,0,0,2,3, + 3,3,3,2,3,2,3,3,2,3,0,3,3,0,3,2,0,2,3,2,2,3,2,2,2,0,3,3,0,0,0,2,0, +
3,3,3,3,3,3,2,3,3,3,0,3,0,3,3,0,3,3,0,3,3,3,3,3,0,0,0,0,2,0,2,0,3, + 3,2,3,3,3,3,3,2,3,2,0,3,2,2,3,3,3,3,0,3,3,3,3,2,3,3,3,0,0,2,2,2,2, + 3,2,0,3,3,0,3,3,3,3,0,0,3,0,3,3,3,3,3,3,2,3,2,0,2,0,3,3,0,0,0,2,2, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,0,3,3,3,3,3,0,0,0,2,0,0,0,3,2, + 0,0,0,0,3,0,3,3,0,3,0,2,3,0,0,3,0,2,3,0,0,2,0,0,3,3,3,3,0,3,2,3,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,0,2,3,3,0, + 3,0,0,0,3,0,0,3,3,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,0,0,3,0,3,3,0,3,0,2,3,0,2,2,0,0,3,2,2,3,2,0,3,2,3,0,0,0,3,3,0, + 0,3,3,2,3,3,3,3,3,3,0,0,3,2,3,3,3,3,3,3,3,3,3,2,3,3,3,2,0,2,3,3,2, + 3,0,3,3,3,2,0,3,3,3,0,2,0,0,2,0,3,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,3,0,3,0,3,3,2,3,0,3,0,0,3,0,3,2,0,2,3,3,2,0,0,2,0,2,2,0,2,0,0, + 3,2,0,3,3,2,3,3,0,3,0,3,3,0,3,3,0,3,2,2,3,3,2,2,0,3,0,3,0,0,3,0,0, + 3,0,2,2,2,0,3,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0, + 2,3,3,3,3,2,3,3,3,3,0,2,3,0,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,0,2,0, + 3,0,2,2,2,0,3,2,2,3,0,3,0,0,2,2,2,2,0,3,0,3,2,2,0,3,0,0,0,0,0,0,0, + 3,2,3,3,3,3,3,3,3,3,0,3,0,3,3,2,3,3,0,3,3,3,3,3,0,0,0,0,2,0,0,0,3, + 2,2,2,2,3,2,3,3,0,3,0,3,3,2,2,2,0,0,3,2,3,3,3,2,3,2,0,2,0,2,0,2,0, + 2,3,2,0,3,2,3,3,0,3,0,2,0,2,3,0,0,2,0,2,3,3,0,2,0,3,2,0,0,0,3,0,2, + 3,0,2,2,0,0,0,3,3,0,0,0,0,0,2,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,2,0,0,0,3,3,0,2,2,0,0,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0, + 3,0,3,2,3,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0, + 3,0,2,3,3,0,3,0,3,0,0,2,0,2,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,2,3,0,0,2,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,3,0,0,3,2,2,0,0,0,0,0,2,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,2,2,2,0,0,0,3,0,0,2,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,3,0,3,3,0,2,0,0,0,2,0,3,2,2,3,0,2,3,0,0,3,3,3,2,0,2,2,2,3, +}; + + +const SequenceModel Windows_1258VietnameseModel = +{ + Windows_1258_CharToOrderMap, + VietnameseLangModel, + 33, + 
(float)0.990048941203513, + PR_FALSE, + "WINDOWS-1258" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 89d5b8c..78c3e3d 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -104,6 +104,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[28] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); mProbers[29] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel); + mProbers[30] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index fe75b77..a846c8d 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 30 +#define NUM_OF_SBCS_PROBERS 31 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index e161b2e..7f24cf5 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -162,5 +162,7 @@ extern const SequenceModel Iso_8859_3EsperantoModel; extern const SequenceModel Iso_8859_3TurkishModel; extern const SequenceModel Iso_8859_9TurkishModel; +extern const SequenceModel Windows_1258VietnameseModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/vi/utf-8.txt b/test/vi/utf-8.txt new file mode 100644 index 0000000..c82798d --- /dev/null +++ b/test/vi/utf-8.txt @@ -0,0 +1,4 @@ +Chữ Quốc ngữ là hệ chữ viết thống nhất chính thức hiện nay của tiếng Việt, sử +dụng ký tự La Tinh, dựa trên các bảng chữ cái của nhóm ngôn ngữ Rôman,[1] đặc +biệt là bảng chữ cái Bồ Đào Nha,[2] với các dấu phụ chủ yếu từ bảng chữ cái Hy +Lạp. 
diff --git a/test/vi/windows-1258.txt b/test/vi/windows-1258.txt new file mode 100644 index 0000000..53526db --- /dev/null +++ b/test/vi/windows-1258.txt @@ -0,0 +1,4 @@ +Ch Quc ng l h ch vit thng nht chnh thc hin nay cua ting Vit, s +dung ky t La Tinh, da trn cc bang ch ci cua nhm ngn ng Rman,[1] c +bit l bang ch ci B o Nha,[2] vi cc du phu chu yu t bang ch ci Hy +Lap.