LangModels: add Arabic support.

Models constructed for ISO-8859-6 and Windows-1256.
This commit is contained in:
Jehan 2015-12-13 18:42:16 +01:00
parent ad2f7212e2
commit 9c3c37517c
10 changed files with 632 additions and 1 deletions

View File

@ -14,6 +14,9 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* UTF-8
* UTF-16BE / UTF-16LE
* UTF-32BE / UTF-32LE / X-ISO-10646-UCS-4-34121 / X-ISO-10646-UCS-4-21431
* Arabic
* ISO-8859-6
* WINDOWS-1256
* Bulgarian
* ISO-8859-5
* WINDOWS-1251

View File

@ -0,0 +1,142 @@
= Logs of language model for Arabic (ar) =
- Generated by BuildLangModel.py
- Started: 2015-12-13 18:31:12.817808
- Maximum depth: 2
- Max number of pages: 50
== Parsed pages ==
الصفحة_الرئيسية (revision 17217037)
11 ديسمبر (revision 17699159)
12 ديسمبر (revision 17710194)
13 ديسمبر (revision 17722318)
1437 هـ (revision 17278274)
14 ديسمبر (revision 17432010)
15 ديسمبر (revision 17206233)
1645 (revision 17168144)
1954 (revision 17409780)
1955 (revision 16826533)
1972 (revision 17004868)
1988 (revision 17671285)
2003 (revision 17656994)
2011 (revision 17589601)
2015 (revision 17678287)
216 ق.م (revision 17586752)
25 يناير (revision 17325864)
2 ربيع الأول (revision 17722146)
6 (عدد) (revision 16972178)
آريز (revision 17466671)
آلهة اليونان (revision 17722617)
أثينا (revision 17642941)
أثينا (ميثولوجيا) (revision 17662932)
أزمة المهاجرين إلى أوروبا (revision 17718437)
أوروبا (revision 17713457)
إس سي إي سانتا مونيكا ستوديو (revision 17035439)
إسبارطة (revision 16733170)
إسماعيل الصفوي (revision 17194218)
إله الحرب (لعبة فيديو) (revision 17630201)
إمارة دبي (revision 17602037)
إيطاليا (revision 17586853)
اتفاق باريس (revision 17718086)
الأزمة الليبية (revision 17630232)
الإمارات العربية المتحدة (revision 17722077)
الإنتخابات البلدية السعودية 2015 (revision 17722004)
الاتحاد الأوروبي لكرة القدم (revision 17596822)
الاحتجاجات اللبنانية 2015 (revision 17315127)
الانتفاضة الفلسطينية (2015) (revision 17710414)
التمرد العراقي (revision 17708640)
الجمعية العامة للأمم المتحدة (revision 17304227)
الجمهورية الرومانية (revision 16472557)
الجيش اللبناني (revision 17516533)
الحرب الأهلية السورية (revision 17675300)
الحرب الأهلية اليمنية (2015) (revision 17686236)
الحرب في شمال غرب باكستان (revision 17490838)
الدولة الصفوية (revision 17031046)
الرياض (revision 17580586)
السعودية (revision 17711339)
السلطة الوطنية الفلسطينية (revision 17438123)
العراق (revision 17704602)
العلاقات الخارجية في تركيا (revision 17647409)
== End of Parsed pages ==
- Wikipedia parsing ended at: 2015-12-13 18:33:58.846891
95 characters appeared 727795 times.
First 64 characters:
[ 0] Char ا: 14.933875610577156 %
[ 1] Char ل: 11.460782225764122 %
[ 2] Char ي: 8.30302489025069 %
[ 3] Char م: 6.3702003998378665 %
[ 4] Char و: 5.952637762007158 %
[ 5] Char ر: 4.9419135883043985 %
[ 6] Char ن: 4.900967992360486 %
[ 7] Char ت: 4.229625100474721 %
[ 8] Char ة: 3.6022506337636284 %
[ 9] Char ب: 3.5434428650925054 %
[10] Char ع: 3.3116468236247845 %
[11] Char د: 3.1756195082406444 %
[12] Char س: 2.5401383631379715 %
[13] Char ف: 2.3899587109007343 %
[14] Char ق: 2.010868445097864 %
[15] Char أ: 1.8763525443291036 %
[16] Char ه: 1.8663222473361318 %
[17] Char ك: 1.8573911609725264 %
[18] Char ح: 1.8431014227907585 %
[19] Char ج: 1.3270220323030524 %
[20] Char ط: 1.0305099650313618 %
[21] Char ش: 0.9638703206260004 %
[22] Char إ: 0.8946200509758929 %
[23] Char ص: 0.8509264284585631 %
[24] Char ى: 0.7726076711161797 %
[25] Char خ: 0.717097534333157 %
[26] Char ز: 0.6687322666410184 %
[27] Char ث: 0.6549921337739336 %
[28] Char ض: 0.5409490309771295 %
[29] Char غ: 0.4574090231452538 %
[30] Char ذ: 0.44765352880962356 %
[31] Char ئ: 0.39269299734128427 %
[32] Char ء: 0.295138053984982 %
[33] Char ظ: 0.2397653185306302 %
[34] Char آ: 0.12324899181775088 %
[35] Char ؤ: 0.08491402111858422 %
[36] Char ـ: 0.047678261048784344 %
[37] Char a: 0.03311372020967443 %
[38] Char e: 0.029403884335561525 %
[39] Char i: 0.027205463076827956 %
[40] Char o: 0.02432003517474014 %
[41] Char t: 0.02349562720271505 %
[42] Char r: 0.02294602188803166 %
[43] Char n: 0.020472797971956388 %
[44] Char s: 0.01799957405588112 %
[45] Char l: 0.012915724895059736 %
[46] Char h: 0.011816514265692949 %
[47] Char d: 0.011129507622338709 %
[48] Char پ: 0.010717303636326163 %
[49] Char c: 0.009480691678288529 %
[50] Char u: 0.007969277062909199 %
[51] Char m: 0.007694474405567502 %
[52] Char A: 0.006870066433542411 %
[53] Char گ: 0.006595263776200715 %
[54] Char f: 0.006183059790188171 %
[55] Char S: 0.005770855804175626 %
[56] Char y: 0.0054960531468339294 %
[57] Char T: 0.0049464478321505365 %
[58] Char b: 0.0048090465034796885 %
[59] Char G: 0.0046716451748088405 %
[60] Char I: 0.004396842517467144 %
[61] Char C: 0.0042594411887962955 %
[62] Char p: 0.0039846385314545995 %
[63] Char k: 0.003709835874112903 %
The first 64 characters have an accumulated ratio of 0.999523217389512.
1479 sequences found.
First 512 (typical positive ratio): 0.9696025116913417
Next 512 (512-1024): 1.3740132867084825e-06
Rest: 0.0012305764497782395
- Processing end: 2015-12-13 18:33:59.193909

View File

@ -0,0 +1,73 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
from codepoints import *
name = 'ISO-8859-6'
aliases = ['ISO_8859-6:1987', 'ISO_8859-6', 'iso-ir-127',
'ECMA-114', 'ASMO-708', 'arabic', 'csISOLatinArabic']
language = \
{
# Dedicated to Arabic.
'complete': [ 'ar' ],
'incomplete': []
}
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
SYM,ILL,ILL,ILL,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,SYM,ILL,ILL, # AX
ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,ILL,ILL,ILL,SYM, # BX
ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,ILL,ILL,ILL,ILL,ILL, # DX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # EX
SYM,SYM,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL, # FX
]

View File

@ -0,0 +1,75 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
from codepoints import *
name = 'WINDOWS-1256'
aliases = ['cswindows1256']
language = \
{
# Dedicated to Arabic (and possibly some other languages that use Arabic
# script, like Persian and Urdu).
# Also contains some French characters for colonial historic reasons
# (upper-case letters with diacritics were not included).
'complete': ['ar', 'fr', 'fa', 'ur'],
'incomplete': []
}
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET, # 8X
LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,SYM,SYM,LET, # 9X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM, # AX
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,SYM,SYM,LET, # FX
]

67
script/langs/ar.py Normal file
View File

@ -0,0 +1,67 @@
#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
import re
## Mandatory Properties ##
name = 'Arabic'
code = 'ar'
use_ascii = False
charsets = ['ISO-8859-6', 'WINDOWS-1256']
## Optional Properties ##
# No alphabet. Arabic is complicated because letters have different
# forms (glyphs) depending on positions. Some charsets would encode
# glyphs while others would encode only the forms. In doubt, I will
# just let the defaults for now.
start_pages = ['الصفحة_الرئيسية']
wikipedia_code = code
case_mapping = False
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
cleaned = re.sub(r'(=+) *([^=]+) *\1',
r'\2',
content)
return cleaned

View File

@ -8,6 +8,7 @@ set(
UCHARDET_SOURCES
CharDistribution.cpp
JpCntx.cpp
LangModels/LangArabicModel.cpp
LangModels/LangBulgarianModel.cpp
LangModels/LangRussianModel.cpp
LangModels/LangEsperantoModel.cpp

View File

@ -0,0 +1,265 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
/********* Language model for: Arabic *********/
/**
* Generated by BuildLangModel.py
* On: 2015-12-13 18:33:58.848027
**/
/* Character Mapping Table:
* ILL: illegal character.
* CTR: control character specific to the charset.
* RET: carriage/return.
* SYM: symbol (punctuation) that does not belong to word.
* NUM: 0 - 9.
*
* Other characters are ordered by probabilities
* (0 is the most common character in the language).
*
* Orders are generic to a language. So the codepoint with order X in
* CHARSET1 maps to the same character as the codepoint with the same
* order X in CHARSET2 for the same language.
* As such, it is possible to get missing order. For instance the
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
* even though they are both used for French. Same for the euro sign.
*/
static const unsigned char Iso_8859_6_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 52, 72, 61, 68, 74, 69, 59, 78, 60, 90, 86, 67, 65, 71, 75, /* 4X */
64, 85, 76, 55, 57, 79, 81, 70, 82, 87, 91,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 37, 58, 49, 47, 38, 54, 66, 46, 39, 88, 63, 45, 51, 43, 40, /* 6X */
62, 89, 42, 44, 41, 50, 77, 73, 83, 56, 80,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,ILL,ILL,ILL,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,SYM,ILL,ILL, /* AX */
ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,ILL,ILL,ILL,SYM, /* BX */
ILL, 32, 34, 15, 35, 22, 31, 0, 9, 8, 7, 27, 19, 18, 25, 11, /* CX */
30, 5, 26, 12, 21, 23, 28,SYM, 33, 10, 29,ILL,ILL,ILL,ILL,ILL, /* DX */
36, 13, 14, 17, 1, 3, 6, 16, 4, 24, 2,SYM,SYM,SYM,SYM,SYM, /* EX */
SYM,SYM,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
static const unsigned char Windows_1256_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 52, 72, 61, 68, 74, 69, 59, 78, 60, 90, 86, 67, 65, 71, 75, /* 4X */
64, 85, 76, 55, 57, 79, 81, 70, 82, 87, 91,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 37, 58, 49, 47, 38, 54, 66, 46, 39, 88, 63, 45, 51, 43, 40, /* 6X */
62, 89, 42, 44, 41, 50, 77, 73, 83, 56, 80,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM, 48,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 95,SYM, 96, 92, 97, 98, /* 8X */
53,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 84,SYM, 99,SYM,100,SYM,SYM,101, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,102,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
103, 32, 34, 15, 35, 22, 31, 0, 9, 8, 7, 27, 19, 18, 25, 11, /* CX */
30, 5, 26, 12, 21, 23, 28,SYM, 20, 33, 10, 29, 36, 13, 14, 17, /* DX */
104, 1, 93, 3, 6, 16, 4,105,106, 94,107,108, 24, 2,109,110, /* EX */
SYM,SYM,SYM,SYM,111,SYM,SYM,SYM,SYM,112,SYM,113,114,SYM,SYM,115, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Model Table:
* Total sequences: 1479
* First 512 sequences: 0.9696025116913417
* Next 512 sequences (512-1024): 0.029166911858880054
* Rest: 0.0012305764497782395
* Negative sequences: TODO
*/
static const PRUint8 ArabicLangModel[] =
{
2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,1,3,1,3,3,3,3,2,2,3,
3,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,
1,2,3,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,3,1,3,3,3,3,2,2,3,
2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,1,3,2,3,3,3,2,2,2,2,
0,2,1,3,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,
2,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,3,2,3,3,2,3,
1,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,3,2,2,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,0,3,2,2,3,2,2,2,3,2,
0,3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,3,3,2,2,
0,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,1,3,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,2,2,1,2,2,2,2,2,2,2,
1,2,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,0,2,2,3,0,3,2,0,3,3,3,0,2,0,
0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,0,2,0,0,3,3,2,3,0,2,0,2,
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,2,3,3,1,0,0,2,2,0,1,0,1,0,1,
0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,3,3,3,3,2,3,2,3,2,3,2,3,2,2,2,2,2,2,2,2,2,2,1,3,2,2,2,
1,3,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,0,2,1,3,2,0,3,2,0,2,0,3,0,2,0,
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,3,3,3,3,3,3,0,3,2,3,2,3,2,3,2,2,
0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,2,3,3,1,3,2,1,2,0,2,2,0,3,2,2,0,0,2,0,2,1,2,0,3,0,
0,1,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,2,2,2,2,2,2,2,2,1,0,2,3,3,0,1,3,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,2,1,3,3,3,3,0,2,3,0,3,2,2,0,3,2,0,3,2,3,0,2,0,
0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,3,1,2,1,0,1,0,0,1,0,3,2,0,2,2,2,
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,3,3,3,2,3,3,2,2,3,2,3,2,2,0,2,1,2,1,1,0,2,1,0,0,0,1,0,2,
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,3,3,2,3,2,3,3,2,1,2,2,2,3,3,2,2,2,0,0,0,2,3,1,0,0,2,1,2,
0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,1,2,3,2,0,2,3,3,3,2,3,0,2,2,2,3,2,2,0,3,0,2,2,2,3,2,3,1,
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,0,2,1,3,0,2,0,0,2,2,2,0,0,0,2,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,2,3,2,3,2,2,0,0,2,0,0,1,3,2,0,3,0,1,2,0,2,0,2,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,2,2,0,2,2,1,2,2,2,2,0,0,0,0,1,2,2,0,0,1,0,2,
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,2,3,2,2,1,1,2,3,1,2,2,0,0,0,0,0,0,1,0,0,2,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,2,3,2,0,2,0,1,2,0,2,1,2,0,0,0,2,2,0,0,0,2,0,2,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,2,1,2,2,2,0,0,2,0,0,2,2,1,0,2,1,0,2,0,2,0,2,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,2,2,2,2,2,2,0,0,0,2,2,0,3,3,0,2,0,0,0,0,2,2,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,3,2,2,3,2,2,2,2,2,2,0,2,2,2,2,2,2,0,1,0,1,2,0,1,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,1,1,0,0,2,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,2,3,2,2,1,2,3,2,0,0,0,2,0,0,3,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,2,2,2,3,2,2,0,2,0,2,2,2,2,0,1,2,1,1,0,2,0,1,0,3,1,2,0,1,2,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,3,2,1,2,1,1,0,2,2,0,2,0,2,2,0,0,0,2,0,0,2,2,1,2,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,
0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,2,2,1,2,2,2,2,2,1,2,0,2,1,2,0,0,1,0,1,0,1,0,0,0,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,1,1,2,2,2,2,2,0,2,0,2,1,2,0,0,1,0,0,0,2,0,0,0,1,2,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,1,2,2,2,2,2,2,0,2,0,2,1,2,0,0,1,0,0,0,1,0,0,0,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,1,2,2,2,2,2,1,1,2,0,2,2,2,0,0,2,0,0,0,1,0,0,0,2,2,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,2,2,1,2,2,2,1,0,1,1,1,0,0,0,0,2,0,2,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,2,1,2,2,2,0,1,0,2,1,2,0,0,0,0,2,0,1,0,0,0,0,2,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,2,0,1,2,1,1,2,0,2,1,0,0,0,1,0,1,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,1,2,0,0,2,1,2,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,1,0,0,1,2,0,2,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,2,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,
2,2,1,0,2,2,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,2,1,0,0,1,2,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,1,1,0,2,2,2,2,1,0,2,0,1,0,2,0,0,0,0,0,0,2,0,0,0,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,2,2,2,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,2,0,0,0,0,0,1,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,2,2,2,1,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,1,
2,2,2,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,1,0,2,2,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,1,0,0,1,2,2,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,2,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,0,1,1,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,0,0,2,0,2,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,2,1,1,0,0,0,0,0,0,1,0,0,2,0,1,0,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,2,1,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
};
const SequenceModel Iso_8859_6ArabicModel =
{
Iso_8859_6_CharToOrderMap,
ArabicLangModel,
64,
(float)0.9696025116913417,
PR_FALSE,
"ISO-8859-6"
};
const SequenceModel Windows_1256ArabicModel =
{
Windows_1256_CharToOrderMap,
ArabicLangModel,
64,
(float)0.9696025116913417,
PR_FALSE,
"WINDOWS-1256"
};

View File

@ -101,6 +101,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[26] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel);
mProbers[27] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel);
mProbers[28] = new nsSingleByteCharSetProber(&Iso_8859_6ArabicModel);
mProbers[29] = new nsSingleByteCharSetProber(&Windows_1256ArabicModel);
Reset();
}

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 28
#define NUM_OF_SBCS_PROBERS 30
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {

View File

@ -122,6 +122,8 @@ protected:
};
extern const SequenceModel Windows_1256ArabicModel;
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Koi8rRussianModel;
extern const SequenceModel Win1251RussianModel;