LangModels: add Windows-1258 support for Vietnamese.

I was planning on adding VISCII support as well, but Python's encode()
method apparently does not support it, so I cannot generate the proper
statistics data with the current version of the script.
This commit is contained in:
Jehan 2016-02-13 02:32:57 +01:00
parent 27135a8880
commit 178c6119b8
12 changed files with 470 additions and 1 deletions

View File

@ -70,6 +70,8 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* Turkish: * Turkish:
* ISO-8859-3 * ISO-8859-3
* ISO-8859-9 * ISO-8859-9
* Vietnamese:
* Windows-1258
* Others * Others
* WINDOWS-1252 * WINDOWS-1252

View File

@ -0,0 +1,99 @@
= Logs of language model for Vietnamese (vi) =
- Generated by BuildLangModel.py
- Started: 2016-02-13 02:13:44.503931
- Maximum depth: 3
- Max number of pages: 40
== Parsed pages ==
Chữ_Quốc_ngữ (revision 22887853)
1651 (revision 21455247)
1773 (revision 21354755)
1815 (revision 21361292)
1838 (revision 21361314)
1865 (revision 21361338)
1869 (revision 21361342)
1888 (revision 21389506)
1902 (revision 21354811)
1918 (revision 21354828)
1919 (revision 21354829)
1938 (revision 21354849)
1945 (revision 21354857)
22 tháng 2 (revision 21376086)
26 tháng 11 (revision 22579845)
28 tháng 12 (revision 22475308)
A (revision 22549334)
ASCII (revision 22528409)
Alexandre de Rhodes (revision 22859954)
Antonio Barbosa (revision 22145269)
B (revision 22836557)
BBC (revision 22863903)
Biên khảo (revision 22531516)
Bán nguyên âm (revision 22655600)
Bình luận (revision 22117664)
Bảng chữ cái Bồ Đào Nha (revision 22887853)
Bảng chữ cái Hy Lạp (revision 21362081)
Bảng chữ cái Latinh (revision 22442448)
Bắc Kỳ (revision 22393289)
Bồ Đào Nha (revision 22620858)
C (revision 21341881)
Cao Xuân Dục (revision 22620201)
Chính tả (revision 22187359)
Chính tả tiếng Việt (revision 20897580)
Chữ Hán (revision 22889609)
Chữ Nôm (revision 22781506)
Chữ cái (revision 22169220)
Công giáo (revision 22173119)
D (revision 21447691)
== End of Parsed pages ==
- Wikipedia parsing ended at: 2016-02-13 02:16:03.731928
49 characters appeared 190798 times.
First 33 characters:
[ 0] Char n: 13.15212947724819 %
[ 1] Char h: 10.371702009455026 %
[ 2] Char t: 8.20134382959989 %
[ 3] Char c: 7.433516074591977 %
[ 4] Char i: 7.238545477415906 %
[ 5] Char g: 6.529418547364228 %
[ 6] Char a: 4.203922472981897 %
[ 7] Char u: 3.328127129215191 %
[ 8] Char m: 3.0540152412499086 %
[ 9] Char o: 3.037767691485236 %
[10] Char đ: 2.5948909317707733 %
[11] Char r: 2.4643864191448546 %
[12] Char à: 2.3878657008983324 %
[13] Char v: 2.269939936477322 %
[14] Char l: 2.2327278063711358 %
[15] Char á: 2.0482394993658217 %
[16] Char p: 1.9214037882996675 %
[17] Char b: 1.7998092223188922 %
[18] Char ư: 1.6813593433893437 %
[19] Char s: 1.6069350831769726 %
[20] Char y: 1.4952986928584158 %
[21] Char e: 1.4544177611924654 %
[22] Char d: 1.3139550729043281 %
[23] Char k: 1.2489648738456378 %
[24] Char â: 1.1278944223734 %
[25] Char ê: 0.977997672931582 %
[26] Char ô: 0.8260044654556128 %
[27] Char ó: 0.7091269300516777 %
[28] Char q: 0.60011111227581 %
[29] Char ơ: 0.4192916068302603 %
[30] Char í: 0.4166710342875712 %
[31] Char ă: 0.37998301868992335 %
[32] Char x: 0.34329500309227556 %
The first 33 characters have an accumulated ratio of 0.9887105734860954.
852 sequences found.
First 512 (typical positive ratio): 0.990048941203513
Next 512 (512-1024): 1.0482290170756506e-05
Rest: -1.5612511283791264e-17
- Processing end: 2016-02-13 02:16:03.877897

72
script/charsets/viscii.py Normal file
View File

@ -0,0 +1,72 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
from codepoints import *
# Charset description for VISCII: its name, aliases, the languages it
# covers, and a per-byte classification table.
name = 'VISCII'
# Alternative names under which the same charset is known.
aliases = ['csVISCII']
# Languages this charset can encode, split by whether coverage of the
# language's alphabet is complete or only partial.
language = \
{
# Dedicated to Vietnamese.
'complete': ['vi'],
'incomplete': []
}
# Classification of each of the 256 byte values, 16 per row; the row comment
# gives the high nibble and the header below gives the low nibble.
# The CTR/RET/SYM/NUM/LET names are presumably provided by the star import
# from `codepoints` -- confirm there for their exact meaning.
# Unlike most 8-bit charsets, six positions in the 0x00-0x1F range (0x02,
# 0x05, 0x06, 0x14, 0x19, 0x1E) are marked LET, and every byte from 0x80 to
# 0xFF is a letter.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,LET,CTR,CTR,LET,LET,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,LET,CTR,CTR,CTR,CTR,LET,CTR,CTR,CTR,CTR,LET,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
]

View File

@ -0,0 +1,72 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
from codepoints import *
# Charset description for Windows-1258: its name, aliases, the languages it
# covers, and a per-byte classification table.
name = 'WINDOWS-1258'
# Alternative names under which the same charset is known.
aliases = ['cswindows1258']
# Languages this charset can encode, split by whether coverage of the
# language's alphabet is complete or only partial.
language = \
{
# Dedicated to Vietnamese.
'complete': ['vi'],
'incomplete': []
}
# Classification of each of the 256 byte values, 16 per row; the row comment
# gives the high nibble and the header below gives the low nibble.
# The ILL/CTR/RET/SYM/NUM/LET names are presumably provided by the star
# import from `codepoints` -- confirm there for their exact meaning.
# Bytes 0xCC, 0xD2, 0xDE, 0xEC and 0xF2 are SYM, not LET: in Windows-1258
# these slots hold the combining tone marks rather than precomposed letters.
# Byte 0xFE is the dong currency sign (U+20AB), so it is classified as SYM
# like the other currency symbols (e.g. the euro sign at 0x80), not as LET.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
SYM,ILL,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,ILL,ILL,ILL, # 8X
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,ILL,ILL,LET, # 9X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET, # CX
LET,LET,SYM,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,SYM,LET, # DX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET, # EX
LET,LET,SYM,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,SYM,LET, # FX
]

72
script/langs/vi.py Normal file
View File

@ -0,0 +1,72 @@
#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
import re
## Mandatory Properties ##
# Human-readable language name.
name = 'Vietnamese'
# Language code; also reused below as the Wikipedia language code.
code = 'vi'
# Vietnamese is written with ASCII letters, but only a subset of them (see
# `alphabet` below), so the whole ASCII alphabet is not declared as used.
# NOTE(review): the exact effect of this flag lives in the generator script,
# which is not visible here -- confirm there.
use_ascii = False
# From Wikipedia:
# For systems that lack support for Unicode, dozens of 8-bit Vietnamese code
# pages are available.[1] The most common are VISCII (TCVN 5712:1993), VPS, and
# Windows-1258.[3] Where ASCII is required, such as when ensuring readability in
# plain text e-mail, Vietnamese letters are often encoded according to Vietnamese
# Quoted-Readable (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4] though usage of either
# variable-width scheme has declined dramatically following the adoption of
# Unicode on the World Wide Web.
# Charsets for which a model is generated for this language.
charsets = ['WINDOWS-1258'] # TODO: add 'VISCII'
## Optional Properties ##
# Base letters of the Vietnamese alphabet, lower case; tone-marked vowels
# are not listed here.
alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
# Title(s) of the Wikipedia page(s) used as the starting point of the crawl.
start_pages = ['Chữ_Quốc_ngữ']
# Wikipedia language edition to crawl; same as the language code.
wikipedia_code = code
# NOTE(review): presumably tells the generator that upper and lower case
# should be treated as the same letter -- confirm in the generator script.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Return *content* with wiki heading markup removed.

    Any run of text enclosed by matching groups of ``=`` signs (for example
    ``== Title ==``) is replaced by the enclosed text alone; everything else
    is returned unchanged.
    """
    # Group 1 is the opening run of '=' and must reappear as the closing
    # run; group 2 is the heading text kept in the output.
    heading_markup = r'(=+) *([^=]+) *\1'
    heading_text_only = r'\2'
    return re.sub(heading_markup, heading_text_only, content)

View File

@ -20,6 +20,7 @@ set(
LangModels/LangSpanishModel.cpp LangModels/LangSpanishModel.cpp
LangModels/LangThaiModel.cpp LangModels/LangThaiModel.cpp
LangModels/LangTurkishModel.cpp LangModels/LangTurkishModel.cpp
LangModels/LangVietnameseModel.cpp
nsHebrewProber.cpp nsHebrewProber.cpp
nsCharSetProber.cpp nsCharSetProber.cpp
nsBig5Prober.cpp nsBig5Prober.cpp

View File

@ -0,0 +1,139 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
/********* Language model for: Vietnamese *********/
/**
* Generated by BuildLangModel.py
* On: 2016-02-13 02:16:03.733608
**/
/* Character Mapping Table:
* ILL: illegal character.
* CTR: control character specific to the charset.
 * RET: carriage return or line feed.
* SYM: symbol (punctuation) that does not belong to word.
* NUM: 0 - 9.
*
* Other characters are ordered by probabilities
* (0 is the most common character in the language).
*
* Orders are generic to a language. So the codepoint with order X in
* CHARSET1 maps to the same character as the codepoint with the same
* order X in CHARSET2 for the same language.
* As such, it is possible to get missing order. For instance the
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
* even though they are both used for French. Same for the euro sign.
*/
/* Maps each Windows-1258 byte value (0x00-0xFF, 16 per row) either to a
 * class marker (ILL/CTR/RET/SYM/NUM) or to the frequency order of the
 * character it encodes. Upper- and lower-case ASCII letters share the same
 * order: the numeric entries of rows 4X/5X equal those of rows 6X/7X. */
static const unsigned char Windows_1258_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 6, 17, 3, 22, 21, 35, 5, 1, 4, 38, 23, 14, 8, 0, 9, /* 4X */
16, 28, 11, 19, 2, 7, 13, 37, 32, 20, 39,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 6, 17, 3, 22, 21, 35, 5, 1, 4, 38, 23, 14, 8, 0, 9, /* 6X */
16, 28, 11, 19, 2, 7, 13, 37, 32, 20, 39,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM, 49,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 47,ILL,ILL,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 47,ILL,ILL, 50, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
12, 15, 24, 31, 45, 52, 46, 41, 40, 34, 25, 53,SYM, 30, 54, 55, /* CX */
10, 56,SYM, 27, 26, 29, 42,SYM, 43, 33, 36, 57, 44, 18,SYM, 48, /* DX */
12, 15, 24, 31, 45, 58, 46, 41, 40, 34, 25, 59,SYM, 30, 60, 61, /* EX */
10, 62,SYM, 27, 26, 29, 42,SYM, 43, 33, 36, 63, 44, 18, 64, 65, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Model Table:
* Total sequences: 852
* First 512 sequences: 0.990048941203513
* Next 512 sequences (512-1024): 0.00995105879648696
* Rest: -1.5612511283791264e-17
* Negative sequences: TODO
*/
/* Sequence table for Vietnamese: 33 rows of 33 entries, stored flat.
 * The entries present in this table are 0, 2 and 3.
 * NOTE(review): presumably indexed by the orders of two consecutive
 * characters, with a higher value meaning a more likely pair -- confirm
 * against the prober that consumes SequenceModel. */
static const PRUint8 VietnameseLangModel[] =
{
3,3,3,3,3,3,3,3,3,3,0,3,3,2,2,2,2,2,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,
3,2,3,0,3,0,3,3,3,3,0,3,3,0,2,3,0,0,3,2,3,3,2,2,3,3,3,3,0,3,3,3,0,
3,3,3,3,3,2,3,3,2,3,2,3,3,3,3,3,3,0,3,3,3,3,2,0,3,3,3,3,0,2,3,3,0,
3,3,3,3,3,0,3,3,2,3,0,3,3,2,3,3,2,2,3,2,3,3,2,3,3,0,3,3,2,3,0,3,2,
3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,0,3,2,3,3,3,0,3,2,2,3,0,0,2,3,
3,3,3,2,3,2,3,3,2,3,0,3,3,0,3,2,0,2,3,2,2,3,2,2,2,0,3,3,0,0,0,2,0,
3,3,3,3,3,3,2,3,3,3,0,3,0,3,3,0,3,3,0,3,3,3,3,3,0,0,0,0,2,0,2,0,3,
3,2,3,3,3,3,3,2,3,2,0,3,2,2,3,3,3,3,0,3,3,3,3,2,3,3,3,0,0,2,2,2,2,
3,2,0,3,3,0,3,3,3,3,0,0,3,0,3,3,3,3,3,3,2,3,2,0,2,0,3,3,0,0,0,2,2,
3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,0,3,3,3,3,3,0,0,0,2,0,0,0,3,2,
0,0,0,0,3,0,3,3,0,3,0,2,3,0,0,3,0,2,3,0,0,2,0,0,3,3,3,3,0,3,2,3,0,
3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,0,2,3,3,0,
3,0,0,0,3,0,0,3,3,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,3,0,3,3,0,3,0,2,3,0,2,2,0,0,3,2,2,3,2,0,3,2,3,0,0,0,3,3,0,
0,3,3,2,3,3,3,3,3,3,0,0,3,2,3,3,3,3,3,3,3,3,3,2,3,3,3,2,0,2,3,3,2,
3,0,3,3,3,2,0,3,3,3,0,2,0,0,2,0,3,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,0,3,0,3,3,2,3,0,3,0,0,3,0,3,2,0,2,3,3,2,0,0,2,0,2,2,0,2,0,0,
3,2,0,3,3,2,3,3,0,3,0,3,3,0,3,3,0,3,2,2,3,3,2,2,0,3,0,3,0,0,3,0,0,
3,0,2,2,2,0,3,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,
2,3,3,3,3,2,3,3,3,3,0,2,3,0,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,0,2,0,
3,0,2,2,2,0,3,2,2,3,0,3,0,0,2,2,2,2,0,3,0,3,2,2,0,3,0,0,0,0,0,0,0,
3,2,3,3,3,3,3,3,3,3,0,3,0,3,3,2,3,3,0,3,3,3,3,3,0,0,0,0,2,0,0,0,3,
2,2,2,2,3,2,3,3,0,3,0,3,3,2,2,2,0,0,3,2,3,3,3,2,3,2,0,2,0,2,0,2,0,
2,3,2,0,3,2,3,3,0,3,0,2,0,2,3,0,0,2,0,2,3,3,0,2,0,3,2,0,0,0,3,0,2,
3,0,2,2,0,0,0,3,3,0,0,0,0,0,2,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,3,2,0,0,0,3,3,0,2,2,0,0,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,
3,0,3,2,3,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,
3,0,2,3,3,0,3,0,3,0,0,2,0,2,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,2,2,3,0,0,2,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,3,3,0,0,3,2,2,0,0,0,0,0,2,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,2,2,2,0,0,0,3,0,0,2,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,3,0,3,3,0,2,0,0,0,2,0,3,2,2,3,0,2,3,0,0,3,3,3,2,0,2,2,2,3,
};
/* Sequence model pairing the Windows-1258 byte map with the Vietnamese
 * sequence table. */
const SequenceModel Windows_1258VietnameseModel =
{
Windows_1258_CharToOrderMap, /* byte -> order/class table for this charset */
VietnameseLangModel, /* 33 x 33 sequence table for Vietnamese */
33, /* matches the 33 x 33 shape of VietnameseLangModel */
(float)0.990048941203513, /* same value as the "First 512 sequences" ratio in the model table header */
PR_FALSE, /* NOTE(review): meaning of this flag is not visible here -- confirm in nsSBCharSetProber.h */
"WINDOWS-1258" /* name of the charset this model detects */
};

View File

@ -104,6 +104,8 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[28] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel); mProbers[28] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel);
mProbers[29] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel); mProbers[29] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel);
mProbers[30] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel);
Reset(); Reset();
} }

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__ #define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 30 #define NUM_OF_SBCS_PROBERS 31
class nsCharSetProber; class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber { class nsSBCSGroupProber: public nsCharSetProber {

View File

@ -162,5 +162,7 @@ extern const SequenceModel Iso_8859_3EsperantoModel;
extern const SequenceModel Iso_8859_3TurkishModel; extern const SequenceModel Iso_8859_3TurkishModel;
extern const SequenceModel Iso_8859_9TurkishModel; extern const SequenceModel Iso_8859_9TurkishModel;
extern const SequenceModel Windows_1258VietnameseModel;
#endif /* nsSingleByteCharSetProber_h__ */ #endif /* nsSingleByteCharSetProber_h__ */

4
test/vi/utf-8.txt Normal file
View File

@ -0,0 +1,4 @@
Chữ Quốc ngữ là hệ chữ viết thống nhất chính thức hiện nay của tiếng Việt, sử
dụng ký tự La Tinh, dựa trên các bảng chữ cái của nhóm ngôn ngữ Rôman,[1] đặc
biệt là bảng chữ cái Bồ Đào Nha,[2] với các dấu phụ chủ yếu từ bảng chữ cái Hy
Lạp.

4
test/vi/windows-1258.txt Normal file
View File

@ -0,0 +1,4 @@
ChýÞ Quôìc ngýÞ là hêò chýÞ viêìt thôìng nhâìt chính thýìc hiêòn nay cuÒa tiêìng Viêòt, sýÒ
duòng kyì týò La Tinh, dýòa trên các baÒng chýÞ cái cuÒa nhóm ngôn ngýÞ Rôman,[1] ðãòc
biêòt là baÒng chýÞ cái BôÌ Ðào Nha,[2] võìi các dâìu phuò chuÒ yêìu týÌ baÒng chýÞ cái Hy
Laòp.