script, src: add English language model.

English detection is still quite poor, so I am not adding a unit test yet.
I believe, though, that the detection is bad mostly because of the heavy
shortcutting we do in order to go "fast". I should probably review this
whole part of the logic as well.
This commit is contained in:
Jehan 2021-05-23 19:33:36 +02:00
parent bed459c6e7
commit bfa4b10d4d
10 changed files with 545 additions and 2 deletions

View File

@ -0,0 +1,181 @@
= Logs of language model for English (en) =
- Generated by BuildLangModel.py
- Started: 2021-03-19 23:26:14.143096
- Maximum depth: 4
- Max number of pages: 100
== Parsed pages ==
Marmot (revision 1000529225)
Alashan ground squirrel (revision 1010437381)
Alaska (revision 1012870556)
Alaska marmot (revision 1010409368)
Allen's chipmunk (revision 1010890232)
Alpine chipmunk (revision 1010409470)
Alpine marmot (revision 1012720679)
Alps (revision 1007908369)
Altai Mountains (revision 1006577543)
Ancient Greece (revision 1012778875)
Animal (revision 1013060732)
Animal Diversity Web (revision 996899740)
Antelope squirrel (revision 1010441265)
Apennine Mountains (revision 1009656710)
Arctic ground squirrel (revision 1010409925)
Asia Minor ground squirrel (revision 1010437585)
BNF (identifier) (revision 1010501260)
Baja California rock squirrel (revision 1010410301)
Barcode of Life Data System (revision 997241036)
Bat (revision 1012442106)
Bear (revision 1012937821)
Belding's ground squirrel (revision 1010410588)
Bibcode (identifier) (revision 1009103296)
Black-capped marmot (revision 992988317)
Black-tailed prairie dog (revision 1010411000)
Black Hills (revision 1011995885)
Bobak marmot (revision 1010411082)
Brokpa (revision 1001820104)
Brooks Range (revision 1009930357)
Buller's chipmunk (revision 1010411572)
California chipmunk (revision 1010411807)
California ground squirrel (revision 1010411812)
Callospermophilus (revision 1010416079)
Carpathian Mountains (revision 1011395807)
Cascade Range (revision 1011474213)
Cascade golden-mantled ground squirrel (revision 1010416079)
Chordate (revision 1008964469)
Cliff chipmunk (revision 1010412814)
Colorado chipmunk (revision 1010412919)
Daurian ground squirrel (revision 1010413422)
Deosai National Park (revision 1006913741)
Doi (identifier) (revision 1010427488)
Durango chipmunk (revision 1010413819)
EPPO Code (revision 998151320)
Eastern chipmunk (revision 999177830)
Encyclopedia of Life (revision 994178741)
Espíritu Santo antelope squirrel (revision 1010414324)
Ethnology (revision 1011057083)
Eulipotyphla (revision 1012652578)
Eurasian Steppe (revision 1013064344)
European ground squirrel (revision 1010414381)
Eutamias (revision 1010406609)
Extinction (revision 1011028396)
Fauna Europaea (revision 963073975)
Flower (revision 1010385350)
Forest-steppe marmot (revision 1010436539)
Forrest's rock squirrel (revision 1010437668)
France (revision 1012524494)
Franklin's ground squirrel (revision 1010415067)
French Alps (revision 1006041101)
GND (identifier) (revision 1010440981)
Gallo-Romance languages (revision 1012668074)
Genus (revision 1007184632)
Global Biodiversity Information Facility (revision 1010489511)
Gold (revision 1012856700)
Gold-digging ant (revision 1007959560)
Golden-mantled ground squirrel (revision 1010416079)
Gray-collared chipmunk (revision 1010416642)
Gray-footed chipmunk (revision 1010416658)
Gray marmot (revision 1010416479)
Ground squirrel (revision 1010442953)
Groundhog Day (revision 1012802985)
Gunnison's prairie dog (revision 1010416998)
Harris's antelope squirrel (revision 1010417210)
Herbivore (revision 1006902225)
Herodotus (revision 1012927818)
Hibernate (revision 1009048926)
Hibernation (revision 1009048926)
Himalayan marmot (revision 1010417424)
Hoary marmot (revision 1010417525)
Hopi chipmunk (revision 1010417623)
INaturalist (revision 1009815294)
ISBN (identifier) (revision 1009586768)
Ictidomys (revision 1010406819)
Ictidomys parvidens (revision 1010426310)
Integrated Taxonomic Information System (revision 999235988)
Interim Register of Marine and Nonmarine Genera (revision 995182351)
JSTOR (identifier) (revision 1011078319)
Jacopo Ligozzi (revision 1006687935)
Johann Friedrich Blumenbach (revision 1006564504)
Kazakhstan (revision 1012748504)
LCCN (identifier) (revision 1006934344)
Ladakh (revision 1010799326)
Latin (revision 1012971392)
Least chipmunk (revision 1010419221)
== End of Parsed pages ==
- Wikipedia parsing ended at: 2021-03-19 23:29:33.380471
59 characters appeared 59 times.
Most Frequent characters:
[ 0] Char m: 1.694915254237288 %
[ 1] Char a: 1.694915254237288 %
[ 2] Char r: 1.694915254237288 %
[ 3] Char o: 1.694915254237288 %
[ 4] Char t: 1.694915254237288 %
[ 5] Char s: 1.694915254237288 %
[ 6] Char e: 1.694915254237288 %
[ 7] Char l: 1.694915254237288 %
[ 8] Char i: 1.694915254237288 %
[ 9] Char v: 1.694915254237288 %
[10] Char y: 1.694915254237288 %
[11] Char g: 1.694915254237288 %
[12] Char u: 1.694915254237288 %
[13] Char n: 1.694915254237288 %
[14] Char d: 1.694915254237288 %
[15] Char q: 1.694915254237288 %
[16] Char h: 1.694915254237288 %
[17] Char w: 1.694915254237288 %
[18] Char p: 1.694915254237288 %
[19] Char c: 1.694915254237288 %
[20] Char b: 1.694915254237288 %
[21] Char f: 1.694915254237288 %
[22] Char k: 1.694915254237288 %
[23] Char x: 1.694915254237288 %
[24] Char z: 1.694915254237288 %
[25] Char j: 1.694915254237288 %
[26] Char á: 1.694915254237288 %
[27] Char ö: 1.694915254237288 %
[28] Char ä: 1.694915254237288 %
[29] Char í: 1.694915254237288 %
[30] Char ç: 1.694915254237288 %
[31] Char ô: 1.694915254237288 %
[32] Char à: 1.694915254237288 %
[33] Char ü: 1.694915254237288 %
[34] Char æ: 1.694915254237288 %
[35] Char é: 1.694915254237288 %
[36] Char ï: 1.694915254237288 %
[37] Char û: 1.694915254237288 %
[38] Char ó: 1.694915254237288 %
[39] Char µ: 1.694915254237288 %
[40] Char è: 1.694915254237288 %
[41] Char ì: 1.694915254237288 %
[42] Char î: 1.694915254237288 %
[43] Char ë: 1.694915254237288 %
[44] Char ð: 1.694915254237288 %
[45] Char ý: 1.694915254237288 %
[46] Char š: 1.694915254237288 %
[47] Char ñ: 1.694915254237288 %
[48] Char œ: 1.694915254237288 %
[49] Char ê: 1.694915254237288 %
[50] Char â: 1.694915254237288 %
[51] Char ø: 1.694915254237288 %
[52] Char þ: 1.694915254237288 %
[53] Char å: 1.694915254237288 %
[54] Char ß: 1.694915254237288 %
[55] Char ã: 1.694915254237288 %
[56] Char ž: 1.694915254237288 %
[57] Char õ: 1.694915254237288 %
[58] Char ú: 1.694915254237288 %
The first 59 characters have an accumulated ratio of 0.9999999999999989.
920 sequences found.
First 378 (typical positive ratio): 0.9950109024233114
Next 182 (560-378): 0.003993012537786833
Rest: 0.000996085038901806
- Processing end: 2021-03-19 23:29:33.474226

64
script/langs/en.py Normal file
View File

@ -0,0 +1,64 @@
#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
import re
## Mandatory Properties ##
# The human-readable name of the language, in English.
name = 'English'
# Language code: use 2-letter ISO 639-1 if possible, 3-letter ISO code
# otherwise, or use another catalog as a last resort.
code = 'en'
# ASCII characters are used in English.
use_ascii = True
# The charsets we want to support and create data for.
charsets = ['ISO-8859-1', 'WINDOWS-1252']
## Optional Properties ##
# The start page(s). Though optional, it is advised to choose one yourself.
start_pages = ['Marmot']
# Gives the possibility to select another code for the Wikipedia URL;
# here the language code is simply reused.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This uses Python's own algorithm to determine the upper/lower case of a
# given character.
case_mapping = True

View File

@ -11,6 +11,7 @@ set(
LangModels/LangBulgarianModel.cpp
LangModels/LangCroatianModel.cpp
LangModels/LangCzechModel.cpp
LangModels/LangEnglishModel.cpp
LangModels/LangEsperantoModel.cpp
LangModels/LangEstonianModel.cpp
LangModels/LangFinnishModel.cpp

View File

@ -0,0 +1,289 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsLanguageDetector.h"
/********* Language model for: English *********/
/**
* Generated by BuildLangModel.py
* On: 2021-03-19 23:29:33.380823
**/
/* Character Mapping Table:
* ILL: illegal character.
* CTR: control character specific to the charset.
* RET: line break (carriage return or line feed).
* SYM: symbol (punctuation) that does not belong to word.
* NUM: 0 - 9.
*
* Other characters are ordered by probabilities
* (0 is the most common character in the language).
*
* Orders are generic to a language. So the codepoint with order X in
* CHARSET1 maps to the same character as the codepoint with the same
* order X in CHARSET2 for the same language.
* As such, it is possible to get missing order. For instance the
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
* even though they are both used for French. Same for the euro sign.
*/
/* Byte-to-order table for ISO-8859-1 (256 entries, 16 per row).
 * Entry N holds either a class marker (CTR, RET, SYM, NUM) or the
 * frequency order of the character encoded by byte value N.
 * Upper- and lower-case ASCII letters share one order: the letter
 * entries of rows 4X/5X are repeated in rows 6X/7X.
 */
static const unsigned char Iso_8859_1_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */
44, 47, 59, 38, 31, 57, 27,SYM, 51, 60, 58, 37, 33, 45, 52, 54, /* DX */
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */
44, 47, 61, 38, 31, 57, 27,SYM, 51, 62, 58, 37, 33, 45, 52, 63, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Byte-to-order table for WINDOWS-1252 (256 entries, 16 per row).
 * Entry N holds either a class marker (ILL, CTR, RET, SYM, NUM) or the
 * frequency order of the character encoded by byte value N.
 * Rows 8X/9X carry the WINDOWS-1252 specific symbols and letters; the
 * bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are marked ILL (illegal).
 * Upper- and lower-case ASCII letters share one order: the letter
 * entries of rows 4X/5X are repeated in rows 6X/7X.
 */
static const unsigned char Windows_1252_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */
18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM, 64,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56, 65, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */
44, 47, 66, 38, 31, 57, 27,SYM, 51, 67, 58, 37, 33, 45, 52, 54, /* DX */
32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */
44, 47, 68, 38, 31, 57, 27,SYM, 51, 69, 58, 37, 33, 45, 52, 70, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Number of (code point, order) pairs stored in Unicode_CharOrder. */
static const int Unicode_Char_size = 117;
/* Flat list of (Unicode code point, order) pairs, sorted by ascending
 * code point: even indexes hold the code point, odd indexes its order.
 * The orders are the same ones used by the single-byte tables of this
 * file (e.g. 65 'A' and 97 'a' both map to order 1).
 * NOTE(review): presumably searched by code point by the language
 * detector; confirm against nsLanguageDetector.
 */
static const unsigned int Unicode_CharOrder[] =
{
65, 1, 66, 20, 67, 19, 68, 14, 69, 6, 70, 21, 71, 11, 72, 16,
73, 8, 74, 25, 75, 22, 76, 7, 77, 0, 78, 13, 79, 3, 80, 18,
81, 15, 82, 2, 83, 5, 84, 4, 85, 12, 86, 9, 87, 17, 88, 23,
89, 10, 90, 24, 97, 1, 98, 20, 99, 19, 100, 14, 101, 6,102, 21,
103, 11, 104, 16, 105, 8, 106, 25, 107, 22, 108, 7, 109, 0,110, 13,
111, 3, 112, 18, 113, 15, 114, 2, 115, 5, 116, 4, 117, 12,118, 9,
119, 17, 120, 23, 121, 10, 122, 24, 181, 39, 192, 32, 193, 26,194, 50,
195, 55, 196, 28, 197, 53, 198, 34, 199, 30, 200, 40, 201, 35,202, 49,
203, 43, 204, 41, 205, 29, 206, 42, 207, 36, 208, 44, 209, 47,211, 38,
212, 31, 213, 57, 214, 27, 216, 51, 218, 58, 219, 37, 220, 33,221, 45,
222, 52, 223, 54, 224, 32, 225, 26, 226, 50, 227, 55, 228, 28,229, 53,
230, 34, 231, 30, 232, 40, 233, 35, 234, 49, 235, 43, 236, 41,237, 29,
238, 42, 239, 36, 240, 44, 241, 47, 243, 38, 244, 31, 245, 57,246, 27,
248, 51, 250, 58, 251, 37, 252, 33, 253, 45, 254, 52, 338, 48,339, 48,
352, 46, 353, 46, 381, 56, 382, 56, 924, 39,
};
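/* Model Table:
* Total considered sequences: 920 / 3481
* (3481 = 59 * 59 possible pairs of the 59 frequent characters)
* - Positive sequences: first 378 (0.9950109024233114)
* - Probable sequences: next 182 (560-378) (0.003993012537786833)
* - Neutral sequences: last 2921 (0.000996085038901806)
* (2921 = 3481-560; this count includes the 2561 sequences below)
* - Negative sequences: 2561 (off-ratio)
* (2561 = 3481-920, i.e. the pairs that were never encountered)
* Negative sequences: TODO
*/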
static const PRUint8 EnglishLangModel[] =
{
3,3,1,3,2,3,3,2,3,1,3,2,3,3,2,1,2,1,3,2,3,2,1,1,1,1,2,1,1,
1,0,0,1,1,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,
3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,
0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,1,2,1,1,1,1,
1,0,1,0,0,0,2,0,1,1,0,1,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,
0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,3,3,2,3,2,2,2,0,3,0,1,0,0,
1,0,0,1,1,0,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,3,3,3,3,3,3,3,3,1,1,1,1,1,0,
1,0,0,0,0,0,2,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,0,0,
1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,3,0,2,1,1,0,1,
1,0,1,0,0,0,2,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,2,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,1,3,3,3,3,3,3,3,2,1,0,0,
1,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,3,2,3,1,2,3,2,3,2,2,1,2,1,1,0,1,0,1,1,0,0,1,0,0,1,1,0,0,
1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,0,1,2,3,3,2,1,2,2,2,1,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,3,2,2,1,2,2,1,0,2,1,1,2,0,
1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,1,3,3,3,3,3,3,2,2,1,0,0,
1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,2,3,1,1,0,
2,2,1,0,0,1,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,2,3,1,1,2,2,0,0,0,
1,0,1,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
0,2,0,0,1,1,0,0,2,1,1,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,2,1,3,1,2,2,1,1,0,1,0,1,1,1,
1,0,1,0,1,1,2,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,
1,3,3,3,3,3,3,3,3,0,2,1,1,3,2,0,3,2,1,2,2,2,2,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,0,3,2,3,1,2,0,3,2,3,2,2,0,2,1,1,0,1,0,0,
2,0,1,0,0,2,2,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,0,3,1,2,3,0,2,0,1,0,0,
1,0,1,1,0,0,2,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
2,3,3,3,3,3,3,3,3,1,3,2,3,3,2,0,2,1,2,3,3,2,1,0,1,3,0,0,0,
0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,
2,3,3,3,3,2,3,3,3,0,3,1,3,1,2,0,0,1,1,2,1,3,1,0,0,1,0,1,0,
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,3,3,3,3,0,3,2,3,3,1,1,3,2,2,2,2,1,1,0,1,1,1,2,0,
0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,
1,3,0,3,3,2,3,0,3,2,2,0,3,0,1,1,2,1,3,3,0,2,0,2,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,1,3,2,2,3,2,3,1,2,0,2,2,1,0,2,1,0,0,2,0,1,0,2,1,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,2,3,0,3,3,1,3,1,1,0,3,1,0,0,1,1,1,0,0,0,1,0,0,2,1,1,0,
0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,0,1,1,0,2,0,1,0,1,1,2,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,1,0,2,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,1,0,2,2,0,1,0,1,0,0,0,2,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
1,0,0,0,2,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,1,1,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,2,1,2,2,2,2,1,2,0,2,1,2,2,1,0,0,2,2,1,0,0,0,1,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,1,0,0,2,0,1,0,1,0,0,0,2,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,2,0,0,1,0,0,0,1,0,1,0,1,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
/* Sequence model for English text encoded as ISO-8859-1: pairs the
 * ISO-8859-1 byte table with the shared English sequence matrix. */
const SequenceModel Iso_8859_1EnglishModel =
{
  Iso_8859_1_CharToOrderMap,              /* byte value -> order */
  EnglishLangModel,                       /* 59 x 59 sequence matrix */
  59,                                     /* matrix dimension */
  /* Sum of the positive and probable sequence ratios. */
  static_cast<float>(0.9990039149610982),
  PR_TRUE, /* NOTE(review): flag meaning is defined by SequenceModel; confirm there */
  "ISO-8859-1",                           /* charset name */
  "en"                                    /* language code */
};
/* Sequence model for English text encoded as WINDOWS-1252: pairs the
 * WINDOWS-1252 byte table with the shared English sequence matrix. */
const SequenceModel Windows_1252EnglishModel =
{
  Windows_1252_CharToOrderMap,            /* byte value -> order */
  EnglishLangModel,                       /* 59 x 59 sequence matrix */
  59,                                     /* matrix dimension */
  /* Sum of the positive and probable sequence ratios. */
  static_cast<float>(0.9990039149610982),
  PR_TRUE, /* NOTE(review): flag meaning is defined by SequenceModel; confirm there */
  "WINDOWS-1252",                         /* charset name */
  "en"                                    /* language code */
};
/* Charset-independent English model: uses the Unicode code point table
 * together with the same sequence matrix as the single-byte models. */
const LanguageModel EnglishModel =
{
  "en",                /* language code */
  Unicode_CharOrder,   /* (code point, order) pairs */
  Unicode_Char_size,   /* number of pairs; named constant instead of a
                        * duplicated literal, so the two cannot drift */
  EnglishLangModel,    /* 59 x 59 sequence matrix */
  59,                  /* matrix dimension */
  /* NOTE(review): this ratio differs from the 0.9990039149610982 used by
   * the SequenceModel definitions of this file; confirm which value
   * LanguageModel expects. */
  (float)0.9999999999999989,
};

View File

@ -116,6 +116,7 @@ extern const LanguageModel ArabicModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel CzechModel;
extern const LanguageModel DanishModel;
extern const LanguageModel EnglishModel;
extern const LanguageModel EsperantoModel;
extern const LanguageModel EstonianModel;
extern const LanguageModel FinnishModel;

View File

@ -96,6 +96,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
langDetectors[i][j++] = new nsLanguageDetector(&EnglishModel);
langDetectors[i][j++] = new nsLanguageDetector(&EsperantoModel);
langDetectors[i][j++] = new nsLanguageDetector(&EstonianModel);
langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel);

View File

@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
#define NUM_OF_LANGUAGES 29
#define NUM_OF_LANGUAGES 30
class nsMBCSGroupProber: public nsCharSetProber {
public:

View File

@ -197,6 +197,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
/* The English probers take the next free slots (105 and 106). Reusing
 * index 104 would overwrite, and leak, the IBM865 Norwegian prober
 * allocated just above. NUM_OF_SBCS_PROBERS must be the highest index
 * plus one, hence 107. */
mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
Reset();
}

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 105
#define NUM_OF_SBCS_PROBERS 107
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {

View File

@ -179,6 +179,9 @@ extern const SequenceModel Iso_8859_1DanishModel;
extern const SequenceModel Windows_1252DanishModel;
extern const SequenceModel Ibm865DanishModel;
extern const SequenceModel Iso_8859_1EnglishModel;
extern const SequenceModel Windows_1252EnglishModel;
extern const SequenceModel Iso_8859_13LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
extern const SequenceModel Iso_8859_4LithuanianModel;