mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 08:46:40 +08:00
script, src: add English language model.
English detection is still quite poor, so I am not adding a unit test yet. I believe the bad detection is mostly caused by the excessive shortcutting we do in order to go "fast", so I should probably review this whole part of the logic as well.
This commit is contained in:
parent
bed459c6e7
commit
bfa4b10d4d
181
script/BuildLangModelLogs/LangEnglishModel.log
Normal file
181
script/BuildLangModelLogs/LangEnglishModel.log
Normal file
@ -0,0 +1,181 @@
|
||||
= Logs of language model for English (en) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2021-03-19 23:26:14.143096
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 100
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Marmot (revision 1000529225)
|
||||
Alashan ground squirrel (revision 1010437381)
|
||||
Alaska (revision 1012870556)
|
||||
Alaska marmot (revision 1010409368)
|
||||
Allen's chipmunk (revision 1010890232)
|
||||
Alpine chipmunk (revision 1010409470)
|
||||
Alpine marmot (revision 1012720679)
|
||||
Alps (revision 1007908369)
|
||||
Altai Mountains (revision 1006577543)
|
||||
Ancient Greece (revision 1012778875)
|
||||
Animal (revision 1013060732)
|
||||
Animal Diversity Web (revision 996899740)
|
||||
Antelope squirrel (revision 1010441265)
|
||||
Apennine Mountains (revision 1009656710)
|
||||
Arctic ground squirrel (revision 1010409925)
|
||||
Asia Minor ground squirrel (revision 1010437585)
|
||||
BNF (identifier) (revision 1010501260)
|
||||
Baja California rock squirrel (revision 1010410301)
|
||||
Barcode of Life Data System (revision 997241036)
|
||||
Bat (revision 1012442106)
|
||||
Bear (revision 1012937821)
|
||||
Belding's ground squirrel (revision 1010410588)
|
||||
Bibcode (identifier) (revision 1009103296)
|
||||
Black-capped marmot (revision 992988317)
|
||||
Black-tailed prairie dog (revision 1010411000)
|
||||
Black Hills (revision 1011995885)
|
||||
Bobak marmot (revision 1010411082)
|
||||
Brokpa (revision 1001820104)
|
||||
Brooks Range (revision 1009930357)
|
||||
Buller's chipmunk (revision 1010411572)
|
||||
California chipmunk (revision 1010411807)
|
||||
California ground squirrel (revision 1010411812)
|
||||
Callospermophilus (revision 1010416079)
|
||||
Carpathian Mountains (revision 1011395807)
|
||||
Cascade Range (revision 1011474213)
|
||||
Cascade golden-mantled ground squirrel (revision 1010416079)
|
||||
Chordate (revision 1008964469)
|
||||
Cliff chipmunk (revision 1010412814)
|
||||
Colorado chipmunk (revision 1010412919)
|
||||
Daurian ground squirrel (revision 1010413422)
|
||||
Deosai National Park (revision 1006913741)
|
||||
Doi (identifier) (revision 1010427488)
|
||||
Durango chipmunk (revision 1010413819)
|
||||
EPPO Code (revision 998151320)
|
||||
Eastern chipmunk (revision 999177830)
|
||||
Encyclopedia of Life (revision 994178741)
|
||||
Espíritu Santo antelope squirrel (revision 1010414324)
|
||||
Ethnology (revision 1011057083)
|
||||
Eulipotyphla (revision 1012652578)
|
||||
Eurasian Steppe (revision 1013064344)
|
||||
European ground squirrel (revision 1010414381)
|
||||
Eutamias (revision 1010406609)
|
||||
Extinction (revision 1011028396)
|
||||
Fauna Europaea (revision 963073975)
|
||||
Flower (revision 1010385350)
|
||||
Forest-steppe marmot (revision 1010436539)
|
||||
Forrest's rock squirrel (revision 1010437668)
|
||||
France (revision 1012524494)
|
||||
Franklin's ground squirrel (revision 1010415067)
|
||||
French Alps (revision 1006041101)
|
||||
GND (identifier) (revision 1010440981)
|
||||
Gallo-Romance languages (revision 1012668074)
|
||||
Genus (revision 1007184632)
|
||||
Global Biodiversity Information Facility (revision 1010489511)
|
||||
Gold (revision 1012856700)
|
||||
Gold-digging ant (revision 1007959560)
|
||||
Golden-mantled ground squirrel (revision 1010416079)
|
||||
Gray-collared chipmunk (revision 1010416642)
|
||||
Gray-footed chipmunk (revision 1010416658)
|
||||
Gray marmot (revision 1010416479)
|
||||
Ground squirrel (revision 1010442953)
|
||||
Groundhog Day (revision 1012802985)
|
||||
Gunnison's prairie dog (revision 1010416998)
|
||||
Harris's antelope squirrel (revision 1010417210)
|
||||
Herbivore (revision 1006902225)
|
||||
Herodotus (revision 1012927818)
|
||||
Hibernate (revision 1009048926)
|
||||
Hibernation (revision 1009048926)
|
||||
Himalayan marmot (revision 1010417424)
|
||||
Hoary marmot (revision 1010417525)
|
||||
Hopi chipmunk (revision 1010417623)
|
||||
INaturalist (revision 1009815294)
|
||||
ISBN (identifier) (revision 1009586768)
|
||||
Ictidomys (revision 1010406819)
|
||||
Ictidomys parvidens (revision 1010426310)
|
||||
Integrated Taxonomic Information System (revision 999235988)
|
||||
Interim Register of Marine and Nonmarine Genera (revision 995182351)
|
||||
JSTOR (identifier) (revision 1011078319)
|
||||
Jacopo Ligozzi (revision 1006687935)
|
||||
Johann Friedrich Blumenbach (revision 1006564504)
|
||||
Kazakhstan (revision 1012748504)
|
||||
LCCN (identifier) (revision 1006934344)
|
||||
Ladakh (revision 1010799326)
|
||||
Latin (revision 1012971392)
|
||||
Least chipmunk (revision 1010419221)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2021-03-19 23:29:33.380471
|
||||
|
||||
59 characters appeared 59 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char m: 1.694915254237288 %
|
||||
[ 1] Char a: 1.694915254237288 %
|
||||
[ 2] Char r: 1.694915254237288 %
|
||||
[ 3] Char o: 1.694915254237288 %
|
||||
[ 4] Char t: 1.694915254237288 %
|
||||
[ 5] Char s: 1.694915254237288 %
|
||||
[ 6] Char e: 1.694915254237288 %
|
||||
[ 7] Char l: 1.694915254237288 %
|
||||
[ 8] Char i: 1.694915254237288 %
|
||||
[ 9] Char v: 1.694915254237288 %
|
||||
[10] Char y: 1.694915254237288 %
|
||||
[11] Char g: 1.694915254237288 %
|
||||
[12] Char u: 1.694915254237288 %
|
||||
[13] Char n: 1.694915254237288 %
|
||||
[14] Char d: 1.694915254237288 %
|
||||
[15] Char q: 1.694915254237288 %
|
||||
[16] Char h: 1.694915254237288 %
|
||||
[17] Char w: 1.694915254237288 %
|
||||
[18] Char p: 1.694915254237288 %
|
||||
[19] Char c: 1.694915254237288 %
|
||||
[20] Char b: 1.694915254237288 %
|
||||
[21] Char f: 1.694915254237288 %
|
||||
[22] Char k: 1.694915254237288 %
|
||||
[23] Char x: 1.694915254237288 %
|
||||
[24] Char z: 1.694915254237288 %
|
||||
[25] Char j: 1.694915254237288 %
|
||||
[26] Char á: 1.694915254237288 %
|
||||
[27] Char ö: 1.694915254237288 %
|
||||
[28] Char ä: 1.694915254237288 %
|
||||
[29] Char í: 1.694915254237288 %
|
||||
[30] Char ç: 1.694915254237288 %
|
||||
[31] Char ô: 1.694915254237288 %
|
||||
[32] Char à: 1.694915254237288 %
|
||||
[33] Char ü: 1.694915254237288 %
|
||||
[34] Char æ: 1.694915254237288 %
|
||||
[35] Char é: 1.694915254237288 %
|
||||
[36] Char ï: 1.694915254237288 %
|
||||
[37] Char û: 1.694915254237288 %
|
||||
[38] Char ó: 1.694915254237288 %
|
||||
[39] Char µ: 1.694915254237288 %
|
||||
[40] Char è: 1.694915254237288 %
|
||||
[41] Char ì: 1.694915254237288 %
|
||||
[42] Char î: 1.694915254237288 %
|
||||
[43] Char ë: 1.694915254237288 %
|
||||
[44] Char ð: 1.694915254237288 %
|
||||
[45] Char ý: 1.694915254237288 %
|
||||
[46] Char š: 1.694915254237288 %
|
||||
[47] Char ñ: 1.694915254237288 %
|
||||
[48] Char œ: 1.694915254237288 %
|
||||
[49] Char ê: 1.694915254237288 %
|
||||
[50] Char â: 1.694915254237288 %
|
||||
[51] Char ø: 1.694915254237288 %
|
||||
[52] Char þ: 1.694915254237288 %
|
||||
[53] Char å: 1.694915254237288 %
|
||||
[54] Char ß: 1.694915254237288 %
|
||||
[55] Char ã: 1.694915254237288 %
|
||||
[56] Char ž: 1.694915254237288 %
|
||||
[57] Char õ: 1.694915254237288 %
|
||||
[58] Char ú: 1.694915254237288 %
|
||||
|
||||
The first 59 characters have an accumulated ratio of 0.9999999999999989.
|
||||
|
||||
920 sequences found.
|
||||
|
||||
First 378 (typical positive ratio): 0.9950109024233114
|
||||
Next 182 (560-378): 0.003993012537786833
|
||||
Rest: 0.000996085038901806
|
||||
|
||||
- Processing end: 2021-03-19 23:29:33.474226
|
||||
64
script/langs/en.py
Normal file
64
script/langs/en.py
Normal file
@ -0,0 +1,64 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'English'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'en'
|
||||
# ASCII characters are also used in English.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-1', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['Marmot']
|
||||
# give possibility to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
@ -11,6 +11,7 @@ set(
|
||||
LangModels/LangBulgarianModel.cpp
|
||||
LangModels/LangCroatianModel.cpp
|
||||
LangModels/LangCzechModel.cpp
|
||||
LangModels/LangEnglishModel.cpp
|
||||
LangModels/LangEsperantoModel.cpp
|
||||
LangModels/LangEstonianModel.cpp
|
||||
LangModels/LangFinnishModel.cpp
|
||||
|
||||
289
src/LangModels/LangEnglishModel.cpp
Normal file
289
src/LangModels/LangEnglishModel.cpp
Normal file
@ -0,0 +1,289 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Communicator client code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "../nsSBCharSetProber.h"
|
||||
#include "../nsLanguageDetector.h"
|
||||
|
||||
/********* Language model for: English *********/
|
||||
|
||||
/**
|
||||
* Generated by BuildLangModel.py
|
||||
* On: 2021-03-19 23:29:33.380823
|
||||
**/
|
||||
|
||||
/* Character Mapping Table:
|
||||
* ILL: illegal character.
|
||||
* CTR: control character specific to the charset.
|
||||
* RET: line feed (0x0A) or carriage return (0x0D).
|
||||
* SYM: symbol (punctuation) that does not belong to word.
|
||||
* NUM: 0 - 9.
|
||||
*
|
||||
* Other characters are ordered by probabilities
|
||||
* (0 is the most common character in the language).
|
||||
*
|
||||
* Orders are generic to a language. So the codepoint with order X in
|
||||
* CHARSET1 maps to the same character as the codepoint with the same
|
||||
* order X in CHARSET2 for the same language.
|
||||
* As such, an order may be missing from a given charset. For instance the
|
||||
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||
* even though they are both used for French. Same for the euro sign.
|
||||
*/
|
||||
static const unsigned char Iso_8859_1_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */
|
||||
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */
|
||||
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||
SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */
|
||||
44, 47, 59, 38, 31, 57, 27,SYM, 51, 60, 58, 37, 33, 45, 52, 54, /* DX */
|
||||
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */
|
||||
44, 47, 61, 38, 31, 57, 27,SYM, 51, 62, 58, 37, 33, 45, 52, 63, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
static const unsigned char Windows_1252_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */
|
||||
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */
|
||||
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
SYM,ILL,SYM, 64,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56,ILL, /* 8X */
|
||||
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56, 65, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||
SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */
|
||||
44, 47, 66, 38, 31, 57, 27,SYM, 51, 67, 58, 37, 33, 45, 52, 54, /* DX */
|
||||
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */
|
||||
44, 47, 68, 38, 31, 57, 27,SYM, 51, 69, 58, 37, 33, 45, 52, 70, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
/* Number of (code point, order) pairs stored in Unicode_CharOrder below
 * (14 rows of 8 pairs plus a final row of 5 pairs). */
static const int Unicode_Char_size = 117;
|
||||
/* Flat list of pairs: a Unicode code point followed by that character's
 * order in the English model (0 is the most common character).
 * Upper- and lower-case forms share the same order, e.g. 77 ('M') and
 * 109 ('m') both map to order 0. Pairs are listed by ascending code point.
 * Only orders below 59 appear here. */
static const unsigned int Unicode_CharOrder[] =
|
||||
{
|
||||
65, 1, 66, 20, 67, 19, 68, 14, 69, 6, 70, 21, 71, 11, 72, 16,
|
||||
73, 8, 74, 25, 75, 22, 76, 7, 77, 0, 78, 13, 79, 3, 80, 18,
|
||||
81, 15, 82, 2, 83, 5, 84, 4, 85, 12, 86, 9, 87, 17, 88, 23,
|
||||
89, 10, 90, 24, 97, 1, 98, 20, 99, 19, 100, 14, 101, 6,102, 21,
|
||||
103, 11, 104, 16, 105, 8, 106, 25, 107, 22, 108, 7, 109, 0,110, 13,
|
||||
111, 3, 112, 18, 113, 15, 114, 2, 115, 5, 116, 4, 117, 12,118, 9,
|
||||
119, 17, 120, 23, 121, 10, 122, 24, 181, 39, 192, 32, 193, 26,194, 50,
|
||||
195, 55, 196, 28, 197, 53, 198, 34, 199, 30, 200, 40, 201, 35,202, 49,
|
||||
203, 43, 204, 41, 205, 29, 206, 42, 207, 36, 208, 44, 209, 47,211, 38,
|
||||
212, 31, 213, 57, 214, 27, 216, 51, 218, 58, 219, 37, 220, 33,221, 45,
|
||||
222, 52, 223, 54, 224, 32, 225, 26, 226, 50, 227, 55, 228, 28,229, 53,
|
||||
230, 34, 231, 30, 232, 40, 233, 35, 234, 49, 235, 43, 236, 41,237, 29,
|
||||
238, 42, 239, 36, 240, 44, 241, 47, 243, 38, 244, 31, 245, 57,246, 27,
|
||||
248, 51, 250, 58, 251, 37, 252, 33, 253, 45, 254, 52, 338, 48,339, 48,
|
||||
352, 46, 353, 46, 381, 56, 382, 56, 924, 39,
|
||||
};
|
||||
|
||||
|
||||
/* Model Table:
|
||||
* Total considered sequences: 920 / 3481
|
||||
* - Positive sequences: first 378 (0.9950109024233114)
|
||||
* - Probable sequences: next 182 (560-378) (0.003993012537786833)
|
||||
* - Neutral sequences: last 2921 (0.000996085038901806)
|
||||
* - Negative sequences: 2561 (off-ratio)
|
||||
* Negative sequences: TODO
|
||||
*/
|
||||
static const PRUint8 EnglishLangModel[] =
|
||||
{
|
||||
3,3,1,3,2,3,3,2,3,1,3,2,3,3,2,1,2,1,3,2,3,2,1,1,1,1,2,1,1,
|
||||
1,0,0,1,1,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,
|
||||
3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,
|
||||
0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,1,2,1,1,1,1,
|
||||
1,0,1,0,0,0,2,0,1,1,0,1,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,
|
||||
0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,3,3,2,3,2,2,2,0,3,0,1,0,0,
|
||||
1,0,0,1,1,0,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,3,3,3,3,3,3,3,3,1,1,1,1,1,0,
|
||||
1,0,0,0,0,0,2,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,0,0,
|
||||
1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,3,0,2,1,1,0,1,
|
||||
1,0,1,0,0,0,2,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,1,3,3,3,3,3,3,3,2,1,0,0,
|
||||
1,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,3,2,3,1,2,3,2,3,2,2,1,2,1,1,0,1,0,1,1,0,0,1,0,0,1,1,0,0,
|
||||
1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,0,1,2,3,3,2,1,2,2,2,1,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,3,2,2,1,2,2,1,0,2,1,1,2,0,
|
||||
1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,1,3,3,3,3,3,3,2,2,1,0,0,
|
||||
1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,2,3,1,1,0,
|
||||
2,2,1,0,0,1,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,2,3,1,1,2,2,0,0,0,
|
||||
1,0,1,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
|
||||
0,2,0,0,1,1,0,0,2,1,1,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,2,1,3,1,2,2,1,1,0,1,0,1,1,1,
|
||||
1,0,1,0,1,1,2,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,
|
||||
1,3,3,3,3,3,3,3,3,0,2,1,1,3,2,0,3,2,1,2,2,2,2,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,0,3,2,3,1,2,0,3,2,3,2,2,0,2,1,1,0,1,0,0,
|
||||
2,0,1,0,0,2,2,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,0,3,1,2,3,0,2,0,1,0,0,
|
||||
1,0,1,1,0,0,2,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,3,3,3,3,1,3,2,3,3,2,0,2,1,2,3,3,2,1,0,1,3,0,0,0,
|
||||
0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,2,3,3,3,0,3,1,3,1,2,0,0,1,1,2,1,3,1,0,0,1,0,1,0,
|
||||
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,3,3,3,3,0,3,2,3,3,1,1,3,2,2,2,2,1,1,0,1,1,1,2,0,
|
||||
0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,
|
||||
1,3,0,3,3,2,3,0,3,2,2,0,3,0,1,1,2,1,3,3,0,2,0,2,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,3,1,3,2,2,3,2,3,1,2,0,2,2,1,0,2,1,0,0,2,0,1,0,2,1,0,0,0,
|
||||
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,2,3,0,3,3,1,3,1,1,0,3,1,0,0,1,1,1,0,0,0,1,0,0,2,1,1,0,
|
||||
0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,1,0,1,1,0,2,0,1,0,1,1,2,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,1,0,2,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,1,0,2,2,0,1,0,1,0,0,0,2,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||||
1,0,0,0,2,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,1,1,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,2,1,2,2,2,2,1,2,0,2,1,2,2,1,0,0,2,2,1,0,0,0,1,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,1,0,0,2,0,1,0,1,0,0,0,2,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,2,0,0,1,0,0,0,1,0,1,0,1,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
/* Single-byte sequence model for English text encoded as ISO-8859-1.
 * It pairs the charset-specific byte-to-order map with the English
 * sequence table defined in this file. */
const SequenceModel Iso_8859_1EnglishModel =
|
||||
{
|
||||
Iso_8859_1_CharToOrderMap, /* byte value -> character order for this charset */
|
||||
EnglishLangModel, /* 59 x 59 sequence table defined above */
|
||||
59, /* number of ranked characters; 59 * 59 = 3481 sequences */
|
||||
(float)0.9990039149610982, /* sum of the positive and probable sequence ratios from the Model Table header */
|
||||
PR_TRUE, /* NOTE(review): flag whose meaning is set by the SequenceModel declaration in nsSBCharSetProber.h - confirm */
|
||||
"ISO-8859-1", /* charset name */
|
||||
"en" /* language code */
|
||||
};
|
||||
|
||||
/* Single-byte sequence model for English text encoded as WINDOWS-1252.
 * It pairs the charset-specific byte-to-order map with the English
 * sequence table defined in this file. */
const SequenceModel Windows_1252EnglishModel =
|
||||
{
|
||||
Windows_1252_CharToOrderMap, /* byte value -> character order for this charset */
|
||||
EnglishLangModel, /* 59 x 59 sequence table defined above */
|
||||
59, /* number of ranked characters; 59 * 59 = 3481 sequences */
|
||||
(float)0.9990039149610982, /* sum of the positive and probable sequence ratios from the Model Table header */
|
||||
PR_TRUE, /* NOTE(review): flag whose meaning is set by the SequenceModel declaration in nsSBCharSetProber.h - confirm */
|
||||
"WINDOWS-1252", /* charset name */
|
||||
"en" /* language code */
|
||||
};
|
||||
|
||||
/* Charset-independent English language model. It uses the Unicode
 * code point table instead of a per-charset byte map, and the same
 * sequence table as the single-byte models in this file. */
const LanguageModel EnglishModel =
|
||||
{
|
||||
"en", /* language code */
|
||||
Unicode_CharOrder, /* (code point, order) pairs */
|
||||
117, /* number of pairs; same value as Unicode_Char_size */
|
||||
EnglishLangModel, /* 59 x 59 sequence table defined above */
|
||||
59, /* number of ranked characters */
|
||||
(float)0.9999999999999989, /* NOTE(review): differs from the 0.9990... ratio used by the SequenceModels; presumably the accumulated character ratio - confirm against BuildLangModel.py */
|
||||
};
|
||||
@ -116,6 +116,7 @@ extern const LanguageModel ArabicModel;
|
||||
extern const LanguageModel CroatianModel;
|
||||
extern const LanguageModel CzechModel;
|
||||
extern const LanguageModel DanishModel;
|
||||
extern const LanguageModel EnglishModel;
|
||||
extern const LanguageModel EsperantoModel;
|
||||
extern const LanguageModel EstonianModel;
|
||||
extern const LanguageModel FinnishModel;
|
||||
|
||||
@ -96,6 +96,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&EnglishModel);
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&EsperantoModel);
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&EstonianModel);
|
||||
langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel);
|
||||
|
||||
@ -49,7 +49,7 @@
|
||||
#include "nsEUCTWProber.h"
|
||||
|
||||
#define NUM_OF_PROBERS 8
|
||||
#define NUM_OF_LANGUAGES 29
|
||||
#define NUM_OF_LANGUAGES 30
|
||||
|
||||
class nsMBCSGroupProber: public nsCharSetProber {
|
||||
public:
|
||||
|
||||
@ -197,6 +197,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
|
||||
mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
|
||||
|
||||
/* The English probers take the next two free slots. Index 104 is already
 * occupied by the Ibm865 Norwegian prober assigned just above, so reusing
 * it would overwrite (and leak) that prober. */
mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
|
||||
mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
|
||||
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@
|
||||
#define nsSBCSGroupProber_h__
|
||||
|
||||
|
||||
#define NUM_OF_SBCS_PROBERS 105
|
||||
/* Must equal the number of mProbers[] slots filled by the constructor
 * (indices 0..106 once the two English probers are added). */
#define NUM_OF_SBCS_PROBERS 107
|
||||
|
||||
class nsCharSetProber;
|
||||
class nsSBCSGroupProber: public nsCharSetProber {
|
||||
|
||||
@ -179,6 +179,9 @@ extern const SequenceModel Iso_8859_1DanishModel;
|
||||
extern const SequenceModel Windows_1252DanishModel;
|
||||
extern const SequenceModel Ibm865DanishModel;
|
||||
|
||||
extern const SequenceModel Iso_8859_1EnglishModel;
|
||||
extern const SequenceModel Windows_1252EnglishModel;
|
||||
|
||||
extern const SequenceModel Iso_8859_13LithuanianModel;
|
||||
extern const SequenceModel Iso_8859_10LithuanianModel;
|
||||
extern const SequenceModel Iso_8859_4LithuanianModel;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user