LangModels: add ISO-8859-11 and regenerate TIS-620 Thai models.

ISO-8859-11 is basically exactly identical to TIS-620, with the added
non-breaking space character.
Basically our detection will always return TIS-620 except for
exceptional cases when a text has a non-breaking space.
This commit is contained in:
Jehan 2015-12-04 03:11:09 +01:00
parent ffcd85f709
commit fb3c47a073
9 changed files with 464 additions and 193 deletions

View File

@ -42,6 +42,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* WINDOWS-1255
* Thai
* TIS-620
* ISO-8859-11
* Esperanto
* ISO-8859-3
* French

View File

@ -0,0 +1,77 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
from codepoints import *
# ISO-8859-1 is the full 8-bit range, IANA-defined, superset of ISO/CEI 8859-1.
# It is basically the same as ISO/CEI 8859-1, but with control characters.
# As far as I can see, `iconv` has no support for the ISO/CEI 8859-1 subset,
# so there is no need for us to support it anyway.
name = 'ISO-8859-11'
aliases = []
language = \
{
# Designed for Thai language.
'complete': ['th'],
'incomplete': []
}
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, # DX
LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET, # EX
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,LET,LET,ILL,ILL,ILL,ILL, # FX
]

View File

@ -0,0 +1,77 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
from codepoints import *
# ISO-8859-1 is the full 8-bit range, IANA-defined, superset of ISO/CEI 8859-1.
# It is basically the same as ISO/CEI 8859-1, but with control characters.
# As far as I can see, `iconv` has no support for the ISO/CEI 8859-1 subset,
# so there is no need for us to support it anyway.
name = 'TIS-620'
aliases = []
language = \
{
# Designed for Thai language.
'complete': ['th'],
'incomplete': []
}
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, # DX
LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET, # EX
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,LET,LET,ILL,ILL,ILL,ILL, # FX
]

62
script/langs/th.py Normal file
View File

@ -0,0 +1,62 @@
#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
import re
## Mandatory Properties ##
name = 'Thai'
code = 'th'
use_ascii = False
charsets = ['ISO-8859-11', 'TIS-620']
## Optional Properties ##
# The start page. Though optional, it is advised to choose one yourself.
start_pages = ['หน้าหลัก']
wikipedia_code = code
case_mapping = False
def clean_wikipedia_content(content):
# Get rid of title syntax: "=== Articles connexes ==="
cleaned = re.sub(r'(=+) *([^=]+) *\1',
r'\2',
content)
return cleaned

View File

@ -37,186 +37,229 @@
#include "../nsSBCharSetProber.h"
/********* Language model for: Thai *********/
/****************************************************************
CTR: Control characters that usually does not exist in any text
RET: Carriage/Return
SYM: symbol (punctuation) that does not belong to word
NUM: 0 - 9
/**
* Generated by BuildLangModel.py
* On: 2015-12-04 03:05:06.182099
**/
*****************************************************************/
//The following result for thai was collected from a limited sample (1M).
//Character Mapping Table:
static const unsigned char TIS620CharToOrderMap[] =
/* Character Mapping Table:
* ILL: illegal character.
* CTR: control character specific to the charset.
* RET: carriage/return.
* SYM: symbol (punctuation) that does not belong to word.
* NUM: 0 - 9.
*
* Other characters are ordered by probabilities
* (0 is the most common character in the language).
*
* Orders are generic to a language. So the codepoint with order X in
* CHARSET1 maps to the same character as the codepoint with the same
* order X in CHARSET2 for the same language.
* As such, it is possible to get missing order. For instance the
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
* even though they are both used for French. Same for the euro sign.
*/
static const unsigned char Tis_620_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30
SYM,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, //40
188,189,190, 89, 95,112,113,191,192,193,194,SYM,SYM,SYM,SYM,SYM, //50
SYM, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, //60
96,202, 91, 79, 84,104,105, 97, 98, 92,203,SYM,SYM,SYM,SYM,SYM, //70
209,210,211,212,213, 88,214,215,216,217,218,219,220,118,221,222,
223,224, 99, 85, 83,225,226,227,228,229,230,231,232,233,234,235,
ILL, 5, 30,237, 24,238, 75, 8, 26, 52, 34, 51,119, 47, 58, 57,
49, 53, 55, 43, 20, 19, 44, 14, 48, 3, 17, 25, 39, 62, 31, 54,
45, 9, 16, 2, 61, 15,239, 12, 42, 46, 18, 21, 76, 4, 66, 63,
22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,ILL,ILL,ILL,ILL,244,
11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247,
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,ILL,ILL,ILL,ILL,
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 66, 70, 67, 80, 78, 87, 85, 73, 79, 93, 88, 84, 68, 77, 81, /* 4X */
75,101, 74, 61, 71, 86, 96, 90,103,100, 99,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 35, 64, 48, 52, 32, 60, 65, 54, 36, 97, 76, 46, 56, 41, 40, /* 6X */
59,104, 43, 45, 44, 55, 72, 82, 94, 57, 92,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
ILL, 3, 23,105, 15,106, 89, 5, 21, 63, 26, 31,102, 42, 69, 58, /* AX */
49, 91, 83, 34, 9, 17, 30, 12, 39, 1, 16, 19, 33, 62, 22, 47, /* BX */
38, 7, 10, 2, 50, 11,107, 8, 28, 37, 13, 18, 98, 4, 53, 95, /* CX */
14,SYM, 0, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, /* DX */
6, 20, 27, 24, 25,108, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,109, /* EX */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,110,111,ILL,ILL,ILL,ILL, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
static const unsigned char Iso_8859_11_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 66, 70, 67, 80, 78, 87, 85, 73, 79, 93, 88, 84, 68, 77, 81, /* 4X */
75,101, 74, 61, 71, 86, 96, 90,103,100, 99,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 35, 64, 48, 52, 32, 60, 65, 54, 36, 97, 76, 46, 56, 41, 40, /* 6X */
59,104, 43, 45, 44, 55, 72, 82, 94, 57, 92,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM, 3, 23,112, 15,113, 89, 5, 21, 63, 26, 31,102, 42, 69, 58, /* AX */
49, 91, 83, 34, 9, 17, 30, 12, 39, 1, 16, 19, 33, 62, 22, 47, /* BX */
38, 7, 10, 2, 50, 11,114, 8, 28, 37, 13, 18, 98, 4, 53, 95, /* CX */
14,SYM, 0, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, /* DX */
6, 20, 27, 24, 25,115, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,116, /* EX */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,117,118,ILL,ILL,ILL,ILL, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Model Table:
* Total sequences: 2324
* First 512 sequences: 0.8815720594354438
* Next 512 sequences (512-1024): 0.0920860122682917
* Rest: 0.026341928296264486
* Negative sequences: TODO
*/
static const PRUint8 ThaiLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,
0,2,3,0,0,3,2,3,0,0,2,0,0,0,0,2,0,1,1,1,0,2,0,0,0,0,1,0,0,0,1,1,
3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,
0,3,0,0,0,1,3,3,0,0,1,0,0,0,0,2,0,2,1,2,0,1,0,0,0,0,0,0,0,0,2,1,
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,3,1,3,2,
0,2,3,0,0,2,2,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,2,1,
3,3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,
0,2,1,0,0,3,2,1,0,0,0,0,0,0,0,1,0,3,3,1,0,1,0,0,0,0,3,0,0,0,1,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,2,2,2,3,3,2,2,1,2,2,2,
0,2,0,0,0,0,2,2,0,0,1,0,0,0,0,2,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,
3,3,3,3,3,2,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,
0,3,0,0,0,1,2,2,0,0,1,0,0,0,0,2,0,1,1,2,0,2,0,0,0,0,0,0,0,0,2,1,
0,3,3,3,3,2,0,3,3,3,3,3,3,3,0,3,3,3,3,3,0,3,3,3,0,0,3,0,3,0,1,3,
0,2,0,0,0,2,2,2,0,0,0,0,0,0,0,3,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,3,
3,3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,2,1,0,2,1,
0,2,2,0,1,2,2,1,0,0,1,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1,
3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,3,2,2,2,3,3,3,2,2,2,2,2,2,0,2,2,
0,1,2,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,3,1,0,1,0,0,0,0,0,0,0,0,1,1,
3,3,3,3,3,3,3,2,3,2,3,3,3,3,0,3,2,3,2,2,3,2,2,3,3,3,2,2,1,3,2,1,
0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,1,2,2,
0,2,0,0,0,0,3,1,0,0,1,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1,
3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,2,2,3,2,2,2,2,1,3,2,2,2,2,1,3,1,2,
0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,
3,3,3,1,2,1,2,1,2,3,3,1,1,2,2,3,2,1,2,1,1,1,2,1,1,1,1,1,3,3,0,1,
0,0,0,0,0,1,1,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,3,3,3,3,3,2,3,2,2,2,2,3,3,3,2,2,1,1,1,2,2,1,2,1,3,3,2,
0,1,0,0,0,0,2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
0,3,3,3,3,1,3,3,3,3,3,2,3,3,0,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,2,2,
0,2,1,0,0,0,2,2,0,0,1,0,0,0,0,1,0,1,1,0,0,2,0,0,0,0,1,0,0,0,1,1,
3,3,3,1,3,2,2,3,3,2,2,3,1,1,2,2,1,2,1,2,1,3,1,1,1,1,1,2,0,3,0,1,
0,0,2,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,1,3,2,3,3,2,3,3,3,1,3,3,3,3,3,3,2,2,2,3,3,2,2,2,2,2,2,
0,2,0,0,0,0,2,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,1,
3,3,3,3,3,1,2,1,2,1,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,1,1,2,1,3,3,1,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,1,2,1,0,3,3,1,2,3,1,1,1,0,0,3,1,1,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,3,1,2,1,2,2,2,3,2,2,2,1,1,2,1,2,2,2,1,1,2,2,1,1,1,0,2,1,
0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,3,0,0,0,0,0,
0,3,3,3,3,1,0,3,2,2,2,3,3,3,0,3,3,3,3,3,0,1,2,2,0,0,1,0,0,0,3,3,
0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,1,3,2,2,2,1,1,2,2,3,2,1,2,1,1,2,3,3,2,2,2,1,2,0,3,1,2,
0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,1,3,2,3,1,2,2,3,2,3,3,3,2,0,1,3,1,1,1,2,2,1,2,1,1,1,1,1,1,1,0,
0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,1,1,3,0,1,1,2,1,2,1,2,1,0,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,1,1,
0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,0,0,0,0,0,2,1,0,0,2,0,1,1,3,3,1,0,3,0,0,0,0,3,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,3,2,2,0,0,3,3,3,0,2,3,1,0,2,2,2,2,3,0,1,1,3,0,0,1,0,0,0,1,2,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,
3,3,1,2,3,1,2,2,2,1,2,2,2,2,1,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,
0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,3,3,2,3,0,0,2,1,3,2,3,3,1,0,3,2,3,1,2,0,2,2,1,0,0,1,0,1,0,1,2,
0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,
3,3,2,2,2,0,2,2,2,1,2,1,2,2,0,1,1,2,1,1,2,2,1,2,2,2,1,1,1,0,1,1,
0,0,0,0,0,2,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,
0,3,3,3,2,2,3,2,2,2,1,3,2,2,0,3,2,2,3,1,3,1,2,2,3,2,1,2,1,0,2,1,
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,
3,2,1,1,2,1,2,2,2,1,1,2,2,1,1,1,2,1,1,1,2,1,1,1,2,1,1,1,1,0,1,0,
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,1,1,3,2,2,1,1,1,1,2,1,0,1,1,1,2,0,1,1,0,0,0,0,1,1,1,0,0,0,1,
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
2,0,0,2,2,0,0,0,2,3,0,3,2,3,3,0,2,0,0,0,2,0,1,2,2,1,0,2,2,1,0,0,
1,2,0,1,0,1,1,1,1,1,2,3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,2,2,1,1,1,1,1,1,1,1,2,2,3,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,1,
0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,2,0,0,0,1,3,0,3,3,2,3,0,2,0,0,0,2,0,1,1,2,2,0,2,1,1,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,3,1,0,0,0,3,3,0,2,3,3,2,0,3,0,0,0,2,0,1,1,2,0,0,1,1,0,0,0,
3,1,1,2,1,0,1,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,
0,1,3,0,0,1,2,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,1,0,0,0,1,0,
3,0,2,1,1,0,0,1,0,0,1,0,2,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,3,1,2,1,1,2,1,1,1,0,1,1,0,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,1,1,0,0,0,1,3,0,3,2,2,2,0,2,0,0,0,2,0,1,2,2,1,0,2,3,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,2,2,0,0,0,2,2,0,1,3,2,1,0,2,0,0,0,3,0,1,1,1,1,0,0,1,0,0,0,
3,1,1,1,1,0,2,1,1,0,0,1,2,1,0,1,1,1,2,1,1,1,1,1,2,1,2,1,1,0,1,1,
0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,3,3,0,0,0,2,2,0,2,2,2,1,0,2,0,0,0,2,0,1,1,1,2,0,1,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,2,3,0,0,0,2,1,0,2,2,2,1,0,1,0,0,0,1,0,3,2,1,2,0,1,1,0,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,1,2,0,0,0,2,1,0,1,3,2,1,0,2,0,0,0,1,0,2,1,1,1,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,2,2,0,0,0,2,2,0,0,1,1,2,0,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,
1,1,3,2,2,0,2,1,1,1,1,2,1,1,0,1,1,2,1,0,1,1,1,1,1,1,1,1,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,2,2,0,0,0,2,0,0,1,2,1,1,0,1,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0,
3,1,1,1,2,0,1,2,1,0,0,0,1,2,0,1,2,1,1,1,1,0,0,0,1,1,0,1,1,0,0,1,
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,3,0,0,0,0,0,2,0,0,1,0,0,1,0,2,2,0,0,1,0,0,0,0,0,0,2,0,1,0,
0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,1,0,1,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1,1,1,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,2,2,0,0,0,2,0,0,1,0,1,1,0,1,0,0,0,1,0,1,1,1,2,0,0,2,0,0,0,
2,1,1,0,2,0,2,1,1,1,1,2,1,1,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,2,2,0,0,0,2,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,0,0,0,0,2,0,2,2,2,2,0,2,0,0,0,2,0,1,0,1,1,0,1,1,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,2,2,0,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,2,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,0,0,0,1,1,0,0,1,2,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,
1,0,1,2,1,0,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,0,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,2,1,0,0,0,2,0,0,2,1,1,2,0,0,0,0,0,0,0,2,1,1,2,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,1,2,0,0,0,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,2,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,2,0,0,0,0,0,1,1,1,1,0,1,1,1,0,0,
0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,1,0,0,0,0,1,1,1,1,2,0,0,1,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
//Model Table:
//total sequences: 100%
//first 512 sequences: 92.6386%
//first 1024 sequences:7.3177%
//rest sequences: 1.0230%
//negative sequences: 0.0436%
static const PRUint8 ThaiLangModel[] =
const SequenceModel Tis_620ThaiModel =
{
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
0,2,3,0,0,0,0,1,0,1,2,3,1,1,3,2,2,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,3,3,2,3,2,3,3,2,2,2,
3,1,2,3,0,3,3,2,2,1,2,3,3,1,2,0,1,3,0,1,0,0,1,0,0,0,0,0,0,0,1,1,
3,3,2,2,3,3,3,3,1,2,3,3,3,3,3,2,2,2,2,3,3,2,2,3,3,2,2,3,2,3,2,2,
3,3,1,2,3,1,2,2,3,3,1,0,2,1,0,0,3,1,2,1,0,0,1,0,0,0,0,0,0,1,0,1,
3,3,3,3,3,3,2,2,3,3,3,3,2,3,2,2,3,3,2,2,3,2,2,2,2,1,1,3,1,2,1,1,
3,2,1,0,2,1,0,1,0,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,2,3,3,2,2,3,2,3,3,2,3,1,1,2,3,2,2,2,3,2,2,2,2,2,1,2,1,
2,2,1,1,3,3,2,1,0,1,2,2,0,1,3,0,0,0,1,1,0,0,0,0,0,2,3,0,0,2,1,1,
3,3,2,3,3,2,0,0,3,3,0,3,3,0,2,2,3,1,2,2,1,1,1,0,2,2,2,0,2,2,1,1,
0,2,1,0,2,0,0,2,0,1,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,2,3,3,2,0,0,3,3,0,2,3,0,2,1,2,2,2,2,1,2,0,0,2,2,2,0,2,2,1,1,
0,2,1,0,2,0,0,2,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,2,3,2,3,2,0,2,2,1,3,2,1,3,2,1,2,3,2,2,3,0,2,3,2,2,1,2,2,2,2,
1,2,2,0,0,0,0,2,0,1,2,0,1,1,1,0,1,0,3,1,1,0,0,0,0,0,0,0,0,0,1,0,
3,3,2,3,3,2,3,2,2,2,3,2,2,3,2,2,1,2,3,2,2,3,1,3,2,2,2,3,2,2,2,3,
3,2,1,3,0,1,1,1,0,2,1,1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0,
1,0,0,3,0,3,3,3,3,3,0,0,3,0,2,2,3,3,3,3,3,0,0,0,1,1,3,0,0,0,0,2,
0,0,1,0,0,0,0,0,0,0,2,3,0,0,0,3,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
2,0,3,3,3,3,0,0,2,3,0,0,3,0,3,3,2,3,3,3,3,3,0,0,3,3,3,0,0,0,3,3,
0,0,3,0,0,0,0,2,0,0,2,1,1,3,0,0,1,0,0,2,3,0,1,0,0,0,0,0,0,0,1,0,
3,3,3,3,2,3,3,3,3,3,3,3,1,2,1,3,3,2,2,1,2,2,2,3,1,1,2,0,2,1,2,1,
2,2,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,
3,0,2,1,2,3,3,3,0,2,0,2,2,0,2,1,3,2,2,1,2,1,0,0,2,2,1,0,2,1,2,2,
0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,1,3,3,1,1,3,0,2,3,1,1,3,2,1,1,2,0,2,2,3,2,1,1,1,1,1,2,
3,0,0,1,3,1,2,1,2,0,3,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
3,3,1,1,3,2,3,3,3,1,3,2,1,3,2,1,3,2,2,2,2,1,3,3,1,2,1,3,1,2,3,0,
2,1,1,3,2,2,2,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
3,3,2,3,2,3,3,2,3,2,3,2,3,3,2,1,0,3,2,2,2,1,2,2,2,1,2,2,1,2,1,1,
2,2,2,3,0,1,3,1,1,1,1,0,1,1,0,2,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,3,2,2,1,1,3,2,3,2,3,2,0,3,2,2,1,2,0,2,2,2,1,2,2,2,2,1,
3,2,1,2,2,1,0,2,0,1,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,2,3,1,2,3,3,2,2,3,0,1,1,2,0,3,3,2,2,3,0,1,1,3,0,0,0,0,
3,1,0,3,3,0,2,0,2,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,2,3,3,0,1,3,1,1,2,1,2,1,1,3,1,1,0,2,3,1,1,1,1,1,1,1,1,
3,1,1,2,2,2,2,1,1,1,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,2,2,1,1,2,1,3,3,2,3,2,2,3,2,2,3,1,2,2,1,2,0,3,2,1,2,2,2,2,2,1,
3,2,1,2,2,2,1,1,1,1,0,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,1,3,3,0,2,1,0,3,2,0,0,3,1,0,1,1,0,1,0,0,0,0,0,1,
1,0,0,1,0,3,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,2,2,2,3,0,0,1,3,0,3,2,0,3,2,2,3,3,3,3,3,1,0,2,2,2,0,2,2,1,2,
0,2,3,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,0,2,3,1,3,3,2,3,3,0,3,3,0,3,2,2,3,2,3,3,3,0,0,2,2,3,0,1,1,1,3,
0,0,3,0,0,0,2,2,0,1,3,0,1,2,2,2,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,
3,2,3,3,2,0,3,3,2,2,3,1,3,2,1,3,2,0,1,2,2,0,2,3,2,1,0,3,0,0,0,0,
3,0,0,2,3,1,3,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,3,2,2,2,1,2,0,1,3,1,1,3,1,3,0,0,2,1,1,1,1,2,1,1,1,0,2,1,0,1,
1,2,0,0,0,3,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,3,1,0,0,0,1,0,
3,3,3,3,2,2,2,2,2,1,3,1,1,1,2,0,1,1,2,1,2,1,3,2,0,0,3,1,1,1,1,1,
3,1,0,2,3,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,2,3,0,3,3,0,2,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,3,1,3,0,0,1,2,0,0,2,0,3,3,2,3,3,3,2,3,0,0,2,2,2,0,0,0,2,2,
0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,1,2,3,1,3,3,0,0,1,0,3,0,0,0,0,0,
0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,1,2,3,1,2,3,1,0,3,0,2,2,1,0,2,1,1,2,0,1,0,0,1,1,1,1,0,1,0,0,
1,0,0,0,0,1,1,0,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,1,0,1,1,1,3,1,2,2,2,2,2,2,1,1,1,1,0,3,1,0,1,3,1,1,1,1,
1,1,0,2,0,1,3,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,
3,0,2,2,1,3,3,2,3,3,0,1,1,0,2,2,1,2,1,3,3,1,0,0,3,2,0,0,0,0,2,1,
0,1,0,0,0,0,1,2,0,1,1,3,1,1,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,3,0,0,1,0,0,0,3,0,0,3,0,3,1,0,1,1,1,3,2,0,0,0,3,0,0,0,0,2,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,1,3,2,1,3,3,1,2,2,0,1,2,1,0,1,2,0,0,0,0,0,3,0,0,0,3,0,0,0,0,
3,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,1,2,0,3,3,3,2,2,0,1,1,0,1,3,0,0,0,2,2,0,0,0,0,3,1,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,2,3,1,2,0,0,2,1,0,3,1,0,1,2,0,1,1,1,1,3,0,0,3,1,1,0,2,2,1,1,
0,2,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,3,1,2,0,0,2,2,0,1,2,0,1,0,1,3,1,2,1,0,0,0,2,0,3,0,0,0,1,0,
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,1,1,2,2,0,0,0,2,0,2,1,0,1,1,0,1,1,1,2,1,0,0,1,1,1,0,2,1,1,1,
0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,
0,0,0,2,0,1,3,1,1,1,1,0,0,0,0,3,2,0,1,0,0,0,1,2,0,0,0,1,0,0,0,0,
0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,2,3,2,2,0,0,0,1,0,0,0,0,2,3,2,1,2,2,3,0,0,0,2,3,1,0,0,0,1,1,
0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,
3,3,2,2,0,1,0,0,0,0,2,0,2,0,1,0,0,0,1,1,0,0,0,2,1,0,1,0,1,1,0,0,
0,1,0,2,0,0,1,0,3,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,1,0,0,1,0,0,0,0,0,1,1,2,0,0,0,0,1,0,0,1,3,1,0,0,0,0,1,1,0,0,
0,1,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
3,3,1,1,1,1,2,3,0,0,2,1,1,1,1,1,0,2,1,1,0,0,0,2,1,0,1,2,1,1,0,1,
2,1,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,1,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,
0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,2,0,0,0,0,0,0,1,2,1,0,1,1,0,2,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,2,0,0,0,1,3,0,1,0,0,0,2,0,0,0,0,0,0,0,1,2,0,0,0,0,0,
3,3,0,0,1,1,2,0,0,1,2,1,0,1,1,1,0,1,1,0,0,2,1,1,0,1,0,0,1,1,1,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,1,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,0,0,1,1,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,0,1,2,0,1,2,0,0,1,1,0,2,0,1,0,0,1,0,0,0,0,1,0,0,0,2,0,0,0,0,
1,0,0,1,0,1,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,2,1,3,0,0,0,0,1,1,0,0,0,0,0,0,0,3,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,1,0,0,2,0,0,2,0,0,1,1,2,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,3,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,
1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,1,0,0,2,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
const SequenceModel TIS620ThaiModel =
{
TIS620CharToOrderMap,
Tis_620_CharToOrderMap,
ThaiLangModel,
64,
(float)0.926386,
(float)0.8815720594354438,
PR_FALSE,
"TIS-620"
};
const SequenceModel Iso_8859_11ThaiModel =
{
Iso_8859_11_CharToOrderMap,
ThaiLangModel,
64,
(float)0.8815720594354438,
PR_FALSE,
"ISO-8859-11"
};

View File

@ -52,46 +52,51 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
mProbers[4] = new nsSingleByteCharSetProber(&Ibm866CyrillicModel);
mProbers[5] = new nsSingleByteCharSetProber(&Ibm855CyrillicModel);
mProbers[6] = new nsSingleByteCharSetProber(&Latin7GreekModel);
mProbers[7] = new nsSingleByteCharSetProber(&Win1253GreekModel);
mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
nsHebrewProber *hebprober = new nsHebrewProber();
// Notice: Any change in these indexes - 10,11,12 must be reflected
// in the code below as well.
mProbers[11] = hebprober;
mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
mProbers[10] = hebprober;
mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
// Tell the Hebrew prober about the logical and visual probers
if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null
{
hebprober->SetModelProbers(mProbers[12], mProbers[13]);
hebprober->SetModelProbers(mProbers[11], mProbers[12]);
}
else // One or more is null. avoid any Hebrew probing, null them all
{
for (PRUint32 i = 11; i <= 13; ++i)
{
delete mProbers[i];
mProbers[i] = 0;
for (PRUint32 i = 10; i <= 12; ++i)
{
delete mProbers[i];
mProbers[i] = 0;
}
}
mProbers[14] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel);
mProbers[15] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel);
mProbers[16] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel);
mProbers[13] = new nsSingleByteCharSetProber(&Tis_620ThaiModel);
mProbers[14] = new nsSingleByteCharSetProber(&Iso_8859_11ThaiModel);
mProbers[17] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
mProbers[18] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
mProbers[15] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel);
mProbers[16] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel);
mProbers[17] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel);
mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
mProbers[18] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
mProbers[19] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
mProbers[21] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
mProbers[20] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
mProbers[21] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
mProbers[22] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
mProbers[24] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
mProbers[22] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
Reset();
}

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 24
#define NUM_OF_SBCS_PROBERS 25
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {

View File

@ -141,7 +141,8 @@ extern const SequenceModel Win1250HungarianModel;
extern const SequenceModel Win1255Model;
extern const SequenceModel TIS620ThaiModel;
extern const SequenceModel Tis_620ThaiModel;
extern const SequenceModel Iso_8859_11ThaiModel;
extern const SequenceModel Iso_8859_15FrenchModel;
extern const SequenceModel Iso_8859_1FrenchModel;

5
test/th/iso-8859-11.txt Normal file
View File

@ -0,0 +1,5 @@
TIS-620
 ÁҾðҚźĹÔľŔŃłąěÍŘľĘŇËĄĂĂÁ 620-2533, ÁÍĄ.620-2533, ËĂ×͡ŐčĂŮé¨ŃĄĄŃšˇŃčÇäťÇčŇ TIS-620 ŕťçšŞŘ´ÍŃĄ˘ĂĐÁҾðҚÍŘľĘŇËĄĂĂÁ˘Í§äˇÂ ÁŐŞ×čÍŕľçÁÇčŇ ĂËŃĘĘÓËĂŃşÍŃĄ˘ĂĐäˇÂˇŐčăŞéĄŃş¤ÍÁžÔÇŕľÍĂě
ĂËŃĘ TIS-620 ÁŐĂŇÂĹĐŕÍŐ´¤ĹéŇÂĂËŃĘ ISO-8859-11 ÁŇĄ ᾥľčҧĄŃšá¤čŕžŐ§ˇŐč ISO-8859-11 ĄÓËš´ăËé A0 ŕťçš "ŕÇéšÇĂäẺäÁčľŃ´¤Ó" (no-break space) ĘčÇš TIS-620 šŃéšáÁé¨ĐʧǚľÓáËšč§ A0 ŕÍŇäÇé áľčĄçäÁčä´éĄÓËš´¤čŇă´ ć ăËé