script, src: rebuild the Danish model.

Now that it has IBM865 support on the main branch and that I rebased,
this feature branch for the new API got broken too.
This commit is contained in:
Jehan 2022-11-30 20:58:37 +01:00
parent 0be80a21db
commit b5b75b81ce
4 changed files with 341 additions and 223 deletions

View File

@ -1,156 +1,240 @@
= Logs of language model for Danish (da) =
- Generated by BuildLangModel.py
- Started: 2021-03-16 01:32:17.684746
- Maximum depth: 4
- Max number of pages: 100
- Started: 2022-11-30 20:49:10.182568
- Maximum depth: 2
- Max number of pages: 200
== Parsed pages ==
Forside (revision 10000691)
1. symfoni (Beethoven) (revision 10648993)
15. marts (revision 8172123)
1917 (revision 10645384)
1930 (revision 10645389)
1940 (revision 10648721)
1951 (revision 10640371)
1972 (revision 10641861)
15. januar (revision 10515606)
IC4 (revision 11317878)
VM i fodbold 2022 (mænd) (revision 11344039)
28. november (revision 9410945)
Forenede Nationer (revision 11199108)
Middelaldercentret (revision 11339897)
Vilhelm Erobreren (revision 11279565)
Casper & Mandrilaftalen (revision 11221713)
Nikolaj Lie Kaas (revision 11322663)
Stig Hoffmeyer (revision 11340274)
Rock and Roll Hall of Fame (revision 8408189)
Anwar Ibrahim (revision 11342876)
Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917)
1940 (revision 11263756)
1937 (revision 11303923)
1934 (revision 11224625)
Danmarksdemokraterne (revision 11335570)
The Julekalender (revision 11341242)
Ruslands invasion af Ukraine 2022 (revision 11335164)
25. november (revision 10378454)
The Jimi Hendrix Experience (revision 10497780)
24. november (revision 6877891)
Vikingetidens rustning og våben (revision 11332607)
Torben Rechendorff (revision 11342962)
Thomas Edison (revision 11052704)
1947 (revision 11252357)
Eurovision Song Contest 2014 (revision 11333950)
29. november (revision 6877900)
Ukraine (revision 11334630)
1990 (revision 11340072)
Maurice Norman (revision 11342318)
Sergej Sjojgu (revision 11309097)
Færøerne (revision 11333678)
Fonograf (revision 11032483)
Folketingsvalget 2022 (revision 11339557)
Hans Magnus Enzensberger (revision 11341046)
Moderaterne (revision 11305861)
Hawaii (revision 11317011)
Mandan (indfødte amerikanere) (revision 11336303)
SI-præfiks (revision 11332802)
Encyklopædi (revision 11315276)
Storbritannien (revision 11329834)
1991 (revision 11250037)
Det Konservative Folkeparti (revision 11313857)
Wandsworth-skjoldet (revision 11341402)
Angolas håndboldlandshold (damer) (revision 11331888)
Shu-bi-dua (revision 11324736)
1877 (revision 11224901)
Kon-Tiki (revision 10615971)
Socialdemokratiet (revision 11325315)
Donatan (revision 10586146)
Adolf Hitler (revision 11317375)
Procent (revision 10764365)
1. juni (revision 10206137)
1863 (revision 11081613)
ISO 3166-1 alpha-3 (revision 11250626)
Senegals håndboldlandshold (herrer) (revision 8621578)
Billion (revision 11039345)
Lørdag (revision 11159889)
Sachsen (revision 11299889)
Vestindien (revision 11330329)
Folketingsvalget 1988 (revision 10970017)
Dogme 95 (revision 10973606)
Encyclopédie (revision 11314734)
Afrikamesterskabet i håndbold 2018 (mænd) (revision 11131830)
Mew (revision 11308840)
2. marts (revision 9423344)
2003 (revision 10654209)
44 f.Kr. (revision 7242128)
7. marts (revision 9423388)
9. marts (revision 10601197)
Abdikation (revision 10197388)
Afsnit af Badehotellet (revision 10654331)
Agnes Slott-Møller (revision 10648962)
Australian Open-mesterskabet i damesingle 2021 (revision 10630904)
Australian Open-mesterskabet i herresingle 2021 (revision 10630887)
Australian Open 2021 (revision 10630544)
Casper & Mandrilaftalen (revision 10444147)
Coronaviruspandemien (revision 10652415)
Cykling under sommer-OL 2012 Linjeløb (kvinder) (revision 10651872)
Dansk (sprog) (revision 10633727)
Den danske Treårsekspedition til Østgrønland 1931-34 (revision 10654093)
Dnepr (revision 10635465)
Donald Trump (revision 10653185)
Døde i 2021 (revision 10653976)
Encyklopædi (revision 10590147)
Eurovision Song Contest 2014 (revision 10592331)
Folkerepublikken Kina (revision 10634829)
Folketinget (revision 10643927)
Fram-ekspeditionen 1910-1912 (revision 10630146)
Frankrig (revision 10648749)
Frankrigs præsidenter (revision 10477099)
Geologi (revision 10631000)
Geoteknik (revision 10603548)
Greater London (revision 10380043)
Hortus Botanicus Amsterdam (revision 8854568)
Hu Jintao (revision 10610855)
IC4 (revision 10577458)
Idus martius (revision 10652897)
Inger Støjberg (revision 10643259)
Italiens premierministre (revision 10625575)
John Polkinghorne (revision 10654447)
Julius Cæsar (revision 10653812)
Korruption (revision 10401686)
Lars Göran Petrov (revision 10650013)
London Underground (revision 10635531)
Marge Simpson (revision 10640942)
Mario Draghi (revision 10652699)
Matilde af Skotland (revision 10648200)
Metrosystemer i verden (revision 10510595)
Middelaldercentret (revision 10574228)
Naomi Osaka (revision 10478959)
Nederlandene (revision 10642742)
Nicolas Sarkozy (revision 10639376)
Nikolaj 2. af Rusland (revision 10639924)
Novak Djokovic (revision 10479710)
Outlaw Gentlemen & Shady Ladies (revision 10492201)
Paris-Nice 2021 (revision 10653019)
Rigsretssagen mod Donald Trump 2021 (revision 10653875)
Rigsretssagen mod Inger Støjberg (revision 10643260)
Rusland (revision 10631140)
Sanja Ilić (revision 10645645)
Senat (revision 10429780)
Senatet (USA) (revision 10624834)
Shu-bi-dua (revision 10630614)
Svend Johansen (skuespiller) (revision 10643631)
Tennis (revision 10651841)
Tommy Troelsen (revision 10648382)
Træsko (revision 10626215)
USA's præsidenter (revision 10639768)
Undergrundsbane (revision 10541653)
Vilhelm Erobreren (revision 10631208)
Wikimedia (revision 10260889)
Wikipedia (revision 10627445)
Zar (revision 10557166)
1800 (revision 10645359)
2. april (revision 9568657)
Burgtheater (revision 9296862)
C-dur (revision 10513719)
Cello (revision 10641506)
Coda (revision 9298442)
Dominant (revision 9513277)
Dynamik (musik) (revision 9504157)
F-dur (revision 8135200)
Fagot (revision 10578018)
Fløjte (revision 10329382)
Harmonik (revision 10577145)
International Music Score Library Project (revision 10115839)
Italienske og franske musikudtryk (revision 10352094)
Johann Georg Albrechtsberger (revision 10289540)
Joseph Haydn (revision 10289602)
Klarinet (revision 10490230)
Klassicisme (musik) (revision 10436811)
Kontrabas (revision 10147393)
Kontrapunkt (musikteori) (revision 10184029)
Leipzig (revision 10611798)
Ludwig van Beethoven (revision 10642134)
Rajon (revision 11185598)
TheTVDB (revision 10969052)
Skueproces (revision 11322041)
New York Times (revision 10236433)
2006 (revision 11271490)
Jacinda Ardern (revision 11243495)
8. maj (revision 9423405)
7. juni (revision 10287352)
Ray Charles (revision 10893843)
Dansk Rock - fra pigtråd til punk (revision 10970784)
1950'erne (revision 10917112)
John Wesley Hyatt (revision 9405508)
Landsdel Hovedstaden (revision 10723037)
Zar-Rusland (revision 11328111)
1816 (revision 11198312)
Engelsk litteratur (revision 10817139)
22. november (revision 10203064)
Maj (revision 11288718)
Progressiv rock (revision 11259601)
Maurice Setters (revision 10936371)
Minkkommissionen (revision 11337058)
Ragnhild Hveger (revision 11072132)
1961 (revision 11224941)
Montenegro (revision 11340028)
Socialkonservatisme (revision 8745187)
TV 2 (revision 11339141)
7. februar (revision 9423377)
Ar (enhed) (revision 11309905)
1881 (revision 11144791)
Etiopisk kalender (revision 9931290)
Ethelbert Nevin (revision 10591854)
The Moscow Times (revision 11329355)
1960'erne (revision 11261802)
15. november (revision 6877873)
Politikens Forlag (revision 11322941)
Island (revision 11219029)
Danmark (revision 11313400)
Det Kongelige Teater (revision 11319106)
20. juni (revision 10232768)
VM i fodbold 1958 (revision 11014260)
Næste folketingsvalg (revision 11338101)
Virtual International Authority File (revision 8702589)
Marmor (revision 11309004)
Oslo (revision 11290885)
1938 (revision 11336099)
Frie Grønne (revision 11294501)
Lottorp (revision 11223312)
1931 (revision 11236350)
1930 (revision 11252037)
Albanien (revision 11309379)
Holger Begtrup (revision 10289352)
1887 (revision 11250123)
Kristen Helveg Petersen (revision 10505239)
Benito Mussolini (revision 11311831)
Tamilrapporten (revision 10672604)
Internationale Valutafond (revision 10871884)
Ron Flowers (revision 10999963)
Scud-missil (revision 11072276)
1860'erne (revision 8151963)
11. november (revision 10903885)
10. november (revision 9286344)
1697 (revision 10865232)
Det Humanistiske Parti (revision 10898925)
1998 (revision 11342743)
Centrum-Demokraterne (revision 11201902)
Præstens Urskov (revision 10261164)
Kraghave (Tingsted Sogn) (revision 11124871)
Burkina Faso (revision 11309150)
Johannes Peter Frederik Königsfeldt (revision 10942128)
John Bardeen (revision 10622362)
Retsforbundet (revision 11333888)
Mykolaiv oblast (revision 11215109)
Folketingsvalget 1932 (revision 10529645)
Atassut (revision 11250468)
1780 (revision 10879041)
Pokalvindernes Europa Cup (revision 10533322)
Harmonium (revision 10648166)
Litra MA (revision 10707516)
14. oktober (revision 9764309)
Letland i Eurovision Song Contest (revision 11273114)
Den røde tråd (sang) (revision 11117198)
Peter A.G. Nielsen (revision 11311663)
Internationalt Standardbognummer (revision 11037702)
Denys Sjmyhal (revision 11184932)
Souvenir (revision 10530474)
Kristendemokraterne (revision 11310458)
Edward Gibbon (revision 11316150)
19. november (revision 10910432)
Aarhus Hovedbanegård (revision 11254458)
Grækere (revision 11277065)
Moderaterna (revision 11275745)
Margrethe 2. (revision 11264709)
1978 (revision 11340075)
Demokratiske Republik Congos håndboldlandshold (damer) (revision 11330801)
Philip af Storbritannien (revision 11307679)
21. århundrede (revision 9838559)
Jørgen Christensen (handelsminister) (revision 9548745)
Holger Juul Hansen (revision 11316843)
Fodboldspiller (revision 11234361)
Parliamo italiano (revision 11322505)
Borgerlig (revision 10930991)
Mail (revision 10885336)
Disko (revision 10767773)
Tunesiens fodboldlandshold (revision 11334411)
6. december (revision 10378463)
Erhvervspartiet (1978-79) (revision 8449157)
Sovjetunionen (revision 11333771)
1567 (revision 10818742)
1875 (revision 11198318)
Hubble-teleskopet (revision 11304842)
Hærulfstenen (revision 11317806)
Frankrig (revision 11235194)
Coney Island (revision 11211594)
1952 (revision 11243498)
== End of Parsed pages ==
- Wikipedia parsing ended at: 2021-03-16 01:36:49.098009
- Wikipedia parsing ended at: 2022-11-30 20:52:37.002648
57 characters appeared 1058523 times.
63 characters appeared 1374958 times.
First 30 characters:
[ 0] Char e: 15.118707859914238 %
[ 1] Char r: 8.552388564065213 %
[ 2] Char n: 7.6833474567864855 %
[ 3] Char t: 7.125305732610439 %
[ 4] Char a: 6.351302711419591 %
[ 5] Char i: 6.265806222443915 %
[ 6] Char s: 6.152629654716997 %
[ 7] Char d: 5.90341447469729 %
[ 8] Char o: 5.144999211164992 %
[ 9] Char l: 5.1253491893893655 %
[10] Char g: 3.907992551885977 %
[11] Char m: 3.3046990948708723 %
[12] Char k: 3.0474538578755492 %
[13] Char f: 2.586434116216653 %
[14] Char v: 2.2680659749481116 %
[15] Char u: 1.9654745338551927 %
[16] Char b: 1.7524418458550264 %
[17] Char p: 1.6338804163915193 %
[18] Char h: 1.5844719481768466 %
[19] Char ø: 0.7598323324103491 %
[20] Char æ: 0.7542585281566863 %
[21] Char å: 0.728278932059105 %
[22] Char y: 0.6751860847615027 %
[23] Char c: 0.6527963964883143 %
[24] Char j: 0.5847770903419198 %
[25] Char w: 0.17241004682940286 %
[26] Char z: 0.0783166733268904 %
[27] Char x: 0.05602145631223884 %
[28] Char é: 0.019177665482941794 %
[29] Char q: 0.016626941502452003 %
Most Frequent characters:
[ 0] Char e: 14.79056087531401 %
[ 1] Char r: 8.641427592697378 %
[ 2] Char n: 7.613105273033795 %
[ 3] Char t: 6.915483963873806 %
[ 4] Char a: 6.583692010955971 %
[ 5] Char i: 6.462524673480935 %
[ 6] Char s: 6.347902990491345 %
[ 7] Char d: 5.849924143137463 %
[ 8] Char l: 5.1523755634717565 %
[ 9] Char o: 4.9496784629057755 %
[10] Char g: 3.827389636628901 %
[11] Char m: 3.251226582921078 %
[12] Char k: 3.2378443559730554 %
[13] Char f: 2.605170485207548 %
[14] Char v: 2.205303725641074 %
[15] Char u: 1.978242244490377 %
[16] Char b: 1.8278376503136822 %
[17] Char p: 1.5923395478261881 %
[18] Char h: 1.5512473835564433 %
[19] Char ø: 0.88409973250092 %
[20] Char æ: 0.7078761678538544 %
[21] Char å: 0.7005304889312983 %
[22] Char y: 0.6576200873044848 %
[23] Char c: 0.648019794059164 %
[24] Char j: 0.646928851644923 %
[25] Char w: 0.14465896412835882 %
[26] Char z: 0.06814753614292218 %
[27] Char x: 0.03643747663564996 %
[28] Char é: 0.020946094353427522 %
[29] Char ó: 0.013600415430871343 %
[30] Char q: 0.013018579476609468 %
The first 30 characters have an accumulated ratio of 0.9997184756495605.
The first 31 characters have an accumulated ratio of 0.9992516135038306.
936 sequences found.
1079 sequences found.
First 512 (typical positive ratio): 0.9962304038307248
Next 512 (512-1024): 0.007598323324103491
Rest: -5.2909066017292616e-17
First 508 (typical positive ratio): 0.995012453333286
Next 198 (706-508): 0.003993410296057376
Rest: 0.0009941363706565953
- Processing end: 2021-03-16 01:36:49.182013
- Processing end: 2022-11-30 20:52:37.084319

View File

@ -42,7 +42,7 @@
/**
* Generated by BuildLangModel.py
* On: 2021-03-16 01:36:49.098484
* On: 2022-11-30 20:52:37.003457
**/
/* Character Mapping Table:
@ -68,18 +68,18 @@ static const unsigned char Iso_8859_15_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 40,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM, 56, 52,SYM,SYM, 56,SYM,SYM,SYM, 57, 58, 59,SYM, /* BX */
41, 32, 48, 60, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */
49, 54, 47, 35, 42, 61, 30,SYM, 19, 55, 38, 62, 31, 51, 50, 44, /* DX */
41, 32, 48, 63, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */
49, 54, 47, 35, 42, 64, 30,SYM, 19, 55, 38, 65, 31, 51, 50, 66, /* FX */
SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM, 55, 56,SYM,SYM, 55,SYM,SYM,SYM, 63, 64, 65,SYM, /* BX */
46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 66, 35, 67, 62, /* CX */
32, 49, 60, 29, 48, 68, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 69, 35, 70, 62, /* EX */
32, 49, 60, 29, 48, 71, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 72, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@ -89,18 +89,18 @@ static const unsigned char Iso_8859_1_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
41, 32, 48, 67, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */
49, 54, 47, 35, 42, 68, 30,SYM, 19, 55, 38, 69, 31, 51, 50, 44, /* DX */
41, 32, 48, 70, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */
49, 54, 47, 35, 42, 71, 30,SYM, 19, 55, 38, 72, 31, 51, 50, 73, /* FX */
SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 73, 35, 74, 62, /* CX */
32, 49, 60, 29, 48, 75, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 76, 35, 77, 62, /* EX */
32, 49, 60, 29, 48, 78, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 79, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@ -110,74 +110,97 @@ static const unsigned char Windows_1252_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM, 74,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 75,ILL, 56,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 76,ILL, 56, 77, /* 9X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM, 80,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 81,ILL, 55,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 82,ILL, 55, 83, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
41, 32, 48, 78, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */
49, 54, 47, 35, 42, 79, 30,SYM, 19, 55, 38, 80, 31, 51, 50, 44, /* DX */
41, 32, 48, 81, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */
49, 54, 47, 35, 42, 82, 30,SYM, 19, 55, 38, 83, 31, 51, 50, 84, /* FX */
SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 84, 35, 85, 62, /* CX */
32, 49, 60, 29, 48, 86, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 87, 35, 88, 62, /* EX */
32, 49, 60, 29, 48, 89, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 90, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
static const int Unicode_Char_size = 60;
static const unsigned char Ibm865_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
40, 31, 28, 52, 38, 46, 21, 40, 43, 36, 39, 62, 91, 92, 38, 21, /* 8X */
28, 20, 20, 48, 33, 60, 59, 61, 93, 33, 31, 19,SYM, 19,SYM,SYM, /* 9X */
34, 35, 29, 37, 49, 49,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
47, 45, 94, 54, 57, 57, 56, 58, 50, 95, 96, 97, 98, 50, 51,SYM, /* EX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
static const int Unicode_Char_size = 62;
static const unsigned int Unicode_CharOrder[] =
{
65, 4, 66, 16, 67, 23, 68, 7, 69, 0, 70, 13, 71, 10, 72, 18,
73, 5, 74, 24, 75, 12, 76, 9, 77, 11, 78, 2, 79, 8, 80, 17,
81, 29, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27,
73, 5, 74, 24, 75, 12, 76, 8, 77, 11, 78, 2, 79, 9, 80, 17,
81, 30, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27,
89, 22, 90, 26, 97, 4, 98, 16, 99, 23, 100, 7, 101, 0,102, 13,
103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 9, 109, 11,110, 2,
111, 8, 112, 17, 113, 29, 114, 1, 115, 6, 116, 3, 117, 15,118, 14,
119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,216, 19,
229, 21, 230, 20, 233, 28, 248, 19,
103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 8, 109, 11,110, 2,
111, 9, 112, 17, 113, 30, 114, 1, 115, 6, 116, 3, 117, 15,118, 14,
119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,211, 29,
216, 19, 229, 21, 230, 20, 233, 28, 243, 29, 248, 19,
};
/* Model Table:
* Total sequences: 936
* First 512 sequences: 0.9962304038307248
* Next 512 sequences (512-1024): 0.003769596169275244
* Rest: -5.2909066017292616e-17
* Total considered sequences: 1079 / 961
* - Positive sequences: first 508 (0.995012453333286)
* - Probable sequences: next 198 (706-508) (0.003993410296057376)
* - Neutral sequences: last 255 (0.0009941363706565953)
* - Negative sequences: -118 (off-ratio)
* Negative sequences: TODO
*/
static const PRUint8 DanishLangModel[] =
{
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,3,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,
3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,3,3,3,3,3,3,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,3,3,2,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,2,0,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,0,
3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,2,2,2,0,0,2,0,
3,3,2,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,2,3,2,0,3,2,0,
3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,3,2,3,3,2,0,3,0,2,0,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,0,2,3,2,2,2,2,0,2,
3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,3,3,2,2,3,3,3,3,2,3,2,2,0,2,0,
3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,2,2,2,0,2,0,2,0,
3,3,3,3,3,3,3,2,3,2,2,3,2,2,3,3,2,2,2,3,3,3,3,2,3,2,0,0,2,0,
3,3,3,3,2,2,3,3,0,3,3,3,3,3,3,2,3,2,2,0,0,0,2,2,3,0,0,0,0,0,
2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,0,0,2,0,0,0,0,0,0,
3,3,3,3,0,0,3,3,2,3,2,2,3,2,3,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,0,0,2,2,2,2,0,0,0,
3,3,2,3,3,3,2,2,3,3,2,2,3,2,2,3,2,2,3,0,3,0,3,3,0,0,2,0,2,2,
3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,3,3,2,3,0,0,2,2,0,2,0,
3,2,2,2,3,3,2,2,3,2,0,2,2,2,0,2,2,0,3,0,2,0,2,2,0,2,0,0,0,0,
3,2,2,2,3,3,2,2,3,0,2,2,2,0,2,2,2,2,2,0,0,0,2,2,2,2,2,2,0,0,
3,2,2,2,3,3,2,0,2,2,0,0,0,2,2,2,2,2,0,0,0,0,0,2,0,2,0,2,0,0,
2,2,3,2,2,0,2,2,2,2,2,2,0,0,2,0,2,2,0,0,0,0,0,2,0,0,2,0,0,0,
0,2,0,0,2,2,0,2,2,2,0,0,2,2,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,2,3,1,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,2,1,
3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,3,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,1,0,3,3,3,3,3,3,0,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,1,3,3,1,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,1,1,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,1,0,2,2,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,1,0,2,1,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,3,3,3,3,2,2,1,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,3,3,3,2,3,2,0,0,1,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,1,1,2,1,0,
3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,1,3,1,1,1,1,1,0,
3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,3,2,2,3,3,3,3,3,2,3,1,1,2,1,1,0,
3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,2,2,2,3,3,3,2,1,3,0,0,0,1,0,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,1,1,0,2,3,2,2,2,3,1,0,0,
3,3,3,3,3,3,3,2,3,3,0,2,1,1,1,3,3,1,2,3,3,3,3,2,3,1,1,0,1,2,0,
3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,3,1,1,0,1,0,2,1,0,
3,3,3,3,3,3,3,2,3,3,1,3,2,2,3,3,1,1,2,3,3,3,3,1,3,3,0,1,1,1,2,
3,3,3,3,1,2,3,3,3,1,3,3,3,2,3,1,3,2,1,0,0,0,2,0,3,0,0,0,0,0,0,
2,3,3,3,1,3,3,3,3,3,3,3,3,3,3,2,3,2,1,0,0,0,0,2,0,0,0,0,0,0,0,
3,3,3,3,0,0,3,3,3,1,2,1,3,2,3,0,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,1,0,0,0,2,2,1,2,1,0,0,0,
3,3,1,3,3,3,2,3,3,3,1,2,3,1,1,3,1,1,3,0,1,0,3,3,1,1,1,0,1,0,1,
3,3,3,3,3,3,3,3,3,3,2,1,2,1,1,3,2,1,2,3,3,1,3,0,0,0,0,0,1,2,0,
3,2,2,2,3,3,2,1,2,3,0,1,1,1,0,2,2,0,2,0,1,0,1,1,1,2,1,0,0,0,0,
3,1,1,1,3,3,1,1,1,3,1,2,1,1,0,2,1,1,1,0,0,0,2,1,2,1,3,0,0,0,0,
2,1,1,1,2,3,1,1,2,2,0,0,1,1,1,1,1,2,2,1,0,0,1,2,0,1,0,1,0,0,0,
2,2,3,2,1,0,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,
0,3,2,2,1,0,1,0,2,0,1,0,2,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,
2,0,0,0,2,2,0,0,0,0,0,0,0,1,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,
};
@ -185,8 +208,8 @@ const SequenceModel Iso_8859_15DanishModel =
{
Iso_8859_15_CharToOrderMap,
DanishLangModel,
30,
(float)0.9962304038307248,
31,
(float)0.9990058636293434,
PR_TRUE,
"ISO-8859-15",
"da"
@ -196,8 +219,8 @@ const SequenceModel Iso_8859_1DanishModel =
{
Iso_8859_1_CharToOrderMap,
DanishLangModel,
30,
(float)0.9962304038307248,
31,
(float)0.9990058636293434,
PR_TRUE,
"ISO-8859-1",
"da"
@ -207,19 +230,30 @@ const SequenceModel Windows_1252DanishModel =
{
Windows_1252_CharToOrderMap,
DanishLangModel,
30,
(float)0.9962304038307248,
31,
(float)0.9990058636293434,
PR_TRUE,
"WINDOWS-1252",
"da"
};
const SequenceModel Ibm865DanishModel =
{
Ibm865_CharToOrderMap,
DanishLangModel,
31,
(float)0.9990058636293434,
PR_TRUE,
"IBM865",
"da"
};
const LanguageModel DanishModel =
{
"da",
Unicode_CharOrder,
60,
62,
DanishLangModel,
30,
(float)0.9962304038307248,
31,
(float)0.9992516135038306,
};

View File

@ -197,8 +197,8 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
mProbers[104] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
mProbers[105] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
Reset();
}

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 106
#define NUM_OF_SBCS_PROBERS 107
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {