script, src: rebuild the Danish model.

Now that the Danish model has IBM865 support on the main branch and I
have rebased onto it, this feature branch for the new API broke as well.
This commit is contained in:
Jehan 2022-11-30 20:58:37 +01:00
parent 0be80a21db
commit b5b75b81ce
4 changed files with 341 additions and 223 deletions

View File

@ -1,156 +1,240 @@
= Logs of language model for Danish (da) = = Logs of language model for Danish (da) =
- Generated by BuildLangModel.py - Generated by BuildLangModel.py
- Started: 2021-03-16 01:32:17.684746 - Started: 2022-11-30 20:49:10.182568
- Maximum depth: 4 - Maximum depth: 2
- Max number of pages: 100 - Max number of pages: 200
== Parsed pages == == Parsed pages ==
Forside (revision 10000691) Forside (revision 10000691)
1. symfoni (Beethoven) (revision 10648993) 15. januar (revision 10515606)
15. marts (revision 8172123) IC4 (revision 11317878)
1917 (revision 10645384) VM i fodbold 2022 (mænd) (revision 11344039)
1930 (revision 10645389) 28. november (revision 9410945)
1940 (revision 10648721) Forenede Nationer (revision 11199108)
1951 (revision 10640371) Middelaldercentret (revision 11339897)
1972 (revision 10641861) Vilhelm Erobreren (revision 11279565)
Casper & Mandrilaftalen (revision 11221713)
Nikolaj Lie Kaas (revision 11322663)
Stig Hoffmeyer (revision 11340274)
Rock and Roll Hall of Fame (revision 8408189)
Anwar Ibrahim (revision 11342876)
Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917)
1940 (revision 11263756)
1937 (revision 11303923)
1934 (revision 11224625)
Danmarksdemokraterne (revision 11335570)
The Julekalender (revision 11341242)
Ruslands invasion af Ukraine 2022 (revision 11335164)
25. november (revision 10378454)
The Jimi Hendrix Experience (revision 10497780)
24. november (revision 6877891)
Vikingetidens rustning og våben (revision 11332607)
Torben Rechendorff (revision 11342962)
Thomas Edison (revision 11052704)
1947 (revision 11252357)
Eurovision Song Contest 2014 (revision 11333950)
29. november (revision 6877900)
Ukraine (revision 11334630)
1990 (revision 11340072)
Maurice Norman (revision 11342318)
Sergej Sjojgu (revision 11309097)
Færøerne (revision 11333678)
Fonograf (revision 11032483)
Folketingsvalget 2022 (revision 11339557)
Hans Magnus Enzensberger (revision 11341046)
Moderaterne (revision 11305861)
Hawaii (revision 11317011)
Mandan (indfødte amerikanere) (revision 11336303)
SI-præfiks (revision 11332802)
Encyklopædi (revision 11315276)
Storbritannien (revision 11329834)
1991 (revision 11250037)
Det Konservative Folkeparti (revision 11313857)
Wandsworth-skjoldet (revision 11341402)
Angolas håndboldlandshold (damer) (revision 11331888)
Shu-bi-dua (revision 11324736)
1877 (revision 11224901)
Kon-Tiki (revision 10615971)
Socialdemokratiet (revision 11325315)
Donatan (revision 10586146)
Adolf Hitler (revision 11317375)
Procent (revision 10764365)
1. juni (revision 10206137)
1863 (revision 11081613)
ISO 3166-1 alpha-3 (revision 11250626)
Senegals håndboldlandshold (herrer) (revision 8621578)
Billion (revision 11039345)
Lørdag (revision 11159889)
Sachsen (revision 11299889)
Vestindien (revision 11330329)
Folketingsvalget 1988 (revision 10970017)
Dogme 95 (revision 10973606)
Encyclopédie (revision 11314734)
Afrikamesterskabet i håndbold 2018 (mænd) (revision 11131830)
Mew (revision 11308840)
2. marts (revision 9423344) 2. marts (revision 9423344)
2003 (revision 10654209) Rajon (revision 11185598)
44 f.Kr. (revision 7242128) TheTVDB (revision 10969052)
7. marts (revision 9423388) Skueproces (revision 11322041)
9. marts (revision 10601197) New York Times (revision 10236433)
Abdikation (revision 10197388) 2006 (revision 11271490)
Afsnit af Badehotellet (revision 10654331) Jacinda Ardern (revision 11243495)
Agnes Slott-Møller (revision 10648962) 8. maj (revision 9423405)
Australian Open-mesterskabet i damesingle 2021 (revision 10630904) 7. juni (revision 10287352)
Australian Open-mesterskabet i herresingle 2021 (revision 10630887) Ray Charles (revision 10893843)
Australian Open 2021 (revision 10630544) Dansk Rock - fra pigtråd til punk (revision 10970784)
Casper & Mandrilaftalen (revision 10444147) 1950'erne (revision 10917112)
Coronaviruspandemien (revision 10652415) John Wesley Hyatt (revision 9405508)
Cykling under sommer-OL 2012 Linjeløb (kvinder) (revision 10651872) Landsdel Hovedstaden (revision 10723037)
Dansk (sprog) (revision 10633727) Zar-Rusland (revision 11328111)
Den danske Treårsekspedition til Østgrønland 1931-34 (revision 10654093) 1816 (revision 11198312)
Dnepr (revision 10635465) Engelsk litteratur (revision 10817139)
Donald Trump (revision 10653185) 22. november (revision 10203064)
Døde i 2021 (revision 10653976) Maj (revision 11288718)
Encyklopædi (revision 10590147) Progressiv rock (revision 11259601)
Eurovision Song Contest 2014 (revision 10592331) Maurice Setters (revision 10936371)
Folkerepublikken Kina (revision 10634829) Minkkommissionen (revision 11337058)
Folketinget (revision 10643927) Ragnhild Hveger (revision 11072132)
Fram-ekspeditionen 1910-1912 (revision 10630146) 1961 (revision 11224941)
Frankrig (revision 10648749) Montenegro (revision 11340028)
Frankrigs præsidenter (revision 10477099) Socialkonservatisme (revision 8745187)
Geologi (revision 10631000) TV 2 (revision 11339141)
Geoteknik (revision 10603548) 7. februar (revision 9423377)
Greater London (revision 10380043) Ar (enhed) (revision 11309905)
Hortus Botanicus Amsterdam (revision 8854568) 1881 (revision 11144791)
Hu Jintao (revision 10610855) Etiopisk kalender (revision 9931290)
IC4 (revision 10577458) Ethelbert Nevin (revision 10591854)
Idus martius (revision 10652897) The Moscow Times (revision 11329355)
Inger Støjberg (revision 10643259) 1960'erne (revision 11261802)
Italiens premierministre (revision 10625575) 15. november (revision 6877873)
John Polkinghorne (revision 10654447) Politikens Forlag (revision 11322941)
Julius Cæsar (revision 10653812) Island (revision 11219029)
Korruption (revision 10401686) Danmark (revision 11313400)
Lars Göran Petrov (revision 10650013) Det Kongelige Teater (revision 11319106)
London Underground (revision 10635531) 20. juni (revision 10232768)
Marge Simpson (revision 10640942) VM i fodbold 1958 (revision 11014260)
Mario Draghi (revision 10652699) Næste folketingsvalg (revision 11338101)
Matilde af Skotland (revision 10648200) Virtual International Authority File (revision 8702589)
Metrosystemer i verden (revision 10510595) Marmor (revision 11309004)
Middelaldercentret (revision 10574228) Oslo (revision 11290885)
Naomi Osaka (revision 10478959) 1938 (revision 11336099)
Nederlandene (revision 10642742) Frie Grønne (revision 11294501)
Nicolas Sarkozy (revision 10639376) Lottorp (revision 11223312)
Nikolaj 2. af Rusland (revision 10639924) 1931 (revision 11236350)
Novak Djokovic (revision 10479710) 1930 (revision 11252037)
Outlaw Gentlemen & Shady Ladies (revision 10492201) Albanien (revision 11309379)
Paris-Nice 2021 (revision 10653019) Holger Begtrup (revision 10289352)
Rigsretssagen mod Donald Trump 2021 (revision 10653875) 1887 (revision 11250123)
Rigsretssagen mod Inger Støjberg (revision 10643260) Kristen Helveg Petersen (revision 10505239)
Rusland (revision 10631140) Benito Mussolini (revision 11311831)
Sanja Ilić (revision 10645645) Tamilrapporten (revision 10672604)
Senat (revision 10429780) Internationale Valutafond (revision 10871884)
Senatet (USA) (revision 10624834) Ron Flowers (revision 10999963)
Shu-bi-dua (revision 10630614) Scud-missil (revision 11072276)
Svend Johansen (skuespiller) (revision 10643631) 1860'erne (revision 8151963)
Tennis (revision 10651841) 11. november (revision 10903885)
Tommy Troelsen (revision 10648382) 10. november (revision 9286344)
Træsko (revision 10626215) 1697 (revision 10865232)
USA's præsidenter (revision 10639768) Det Humanistiske Parti (revision 10898925)
Undergrundsbane (revision 10541653) 1998 (revision 11342743)
Vilhelm Erobreren (revision 10631208) Centrum-Demokraterne (revision 11201902)
Wikimedia (revision 10260889) Præstens Urskov (revision 10261164)
Wikipedia (revision 10627445) Kraghave (Tingsted Sogn) (revision 11124871)
Zar (revision 10557166) Burkina Faso (revision 11309150)
1800 (revision 10645359) Johannes Peter Frederik Königsfeldt (revision 10942128)
2. april (revision 9568657) John Bardeen (revision 10622362)
Burgtheater (revision 9296862) Retsforbundet (revision 11333888)
C-dur (revision 10513719) Mykolaiv oblast (revision 11215109)
Cello (revision 10641506) Folketingsvalget 1932 (revision 10529645)
Coda (revision 9298442) Atassut (revision 11250468)
Dominant (revision 9513277) 1780 (revision 10879041)
Dynamik (musik) (revision 9504157) Pokalvindernes Europa Cup (revision 10533322)
F-dur (revision 8135200) Harmonium (revision 10648166)
Fagot (revision 10578018) Litra MA (revision 10707516)
Fløjte (revision 10329382) 14. oktober (revision 9764309)
Harmonik (revision 10577145) Letland i Eurovision Song Contest (revision 11273114)
International Music Score Library Project (revision 10115839) Den røde tråd (sang) (revision 11117198)
Italienske og franske musikudtryk (revision 10352094) Peter A.G. Nielsen (revision 11311663)
Johann Georg Albrechtsberger (revision 10289540) Internationalt Standardbognummer (revision 11037702)
Joseph Haydn (revision 10289602) Denys Sjmyhal (revision 11184932)
Klarinet (revision 10490230) Souvenir (revision 10530474)
Klassicisme (musik) (revision 10436811) Kristendemokraterne (revision 11310458)
Kontrabas (revision 10147393) Edward Gibbon (revision 11316150)
Kontrapunkt (musikteori) (revision 10184029) 19. november (revision 10910432)
Leipzig (revision 10611798) Aarhus Hovedbanegård (revision 11254458)
Ludwig van Beethoven (revision 10642134) Grækere (revision 11277065)
Moderaterna (revision 11275745)
Margrethe 2. (revision 11264709)
1978 (revision 11340075)
Demokratiske Republik Congos håndboldlandshold (damer) (revision 11330801)
Philip af Storbritannien (revision 11307679)
21. århundrede (revision 9838559)
Jørgen Christensen (handelsminister) (revision 9548745)
Holger Juul Hansen (revision 11316843)
Fodboldspiller (revision 11234361)
Parliamo italiano (revision 11322505)
Borgerlig (revision 10930991)
Mail (revision 10885336)
Disko (revision 10767773)
Tunesiens fodboldlandshold (revision 11334411)
6. december (revision 10378463)
Erhvervspartiet (1978-79) (revision 8449157)
Sovjetunionen (revision 11333771)
1567 (revision 10818742)
1875 (revision 11198318)
Hubble-teleskopet (revision 11304842)
Hærulfstenen (revision 11317806)
Frankrig (revision 11235194)
Coney Island (revision 11211594)
1952 (revision 11243498)
== End of Parsed pages == == End of Parsed pages ==
- Wikipedia parsing ended at: 2021-03-16 01:36:49.098009 - Wikipedia parsing ended at: 2022-11-30 20:52:37.002648
57 characters appeared 1058523 times. 63 characters appeared 1374958 times.
First 30 characters: Most Frequent characters:
[ 0] Char e: 15.118707859914238 % [ 0] Char e: 14.79056087531401 %
[ 1] Char r: 8.552388564065213 % [ 1] Char r: 8.641427592697378 %
[ 2] Char n: 7.6833474567864855 % [ 2] Char n: 7.613105273033795 %
[ 3] Char t: 7.125305732610439 % [ 3] Char t: 6.915483963873806 %
[ 4] Char a: 6.351302711419591 % [ 4] Char a: 6.583692010955971 %
[ 5] Char i: 6.265806222443915 % [ 5] Char i: 6.462524673480935 %
[ 6] Char s: 6.152629654716997 % [ 6] Char s: 6.347902990491345 %
[ 7] Char d: 5.90341447469729 % [ 7] Char d: 5.849924143137463 %
[ 8] Char o: 5.144999211164992 % [ 8] Char l: 5.1523755634717565 %
[ 9] Char l: 5.1253491893893655 % [ 9] Char o: 4.9496784629057755 %
[10] Char g: 3.907992551885977 % [10] Char g: 3.827389636628901 %
[11] Char m: 3.3046990948708723 % [11] Char m: 3.251226582921078 %
[12] Char k: 3.0474538578755492 % [12] Char k: 3.2378443559730554 %
[13] Char f: 2.586434116216653 % [13] Char f: 2.605170485207548 %
[14] Char v: 2.2680659749481116 % [14] Char v: 2.205303725641074 %
[15] Char u: 1.9654745338551927 % [15] Char u: 1.978242244490377 %
[16] Char b: 1.7524418458550264 % [16] Char b: 1.8278376503136822 %
[17] Char p: 1.6338804163915193 % [17] Char p: 1.5923395478261881 %
[18] Char h: 1.5844719481768466 % [18] Char h: 1.5512473835564433 %
[19] Char ø: 0.7598323324103491 % [19] Char ø: 0.88409973250092 %
[20] Char æ: 0.7542585281566863 % [20] Char æ: 0.7078761678538544 %
[21] Char å: 0.728278932059105 % [21] Char å: 0.7005304889312983 %
[22] Char y: 0.6751860847615027 % [22] Char y: 0.6576200873044848 %
[23] Char c: 0.6527963964883143 % [23] Char c: 0.648019794059164 %
[24] Char j: 0.5847770903419198 % [24] Char j: 0.646928851644923 %
[25] Char w: 0.17241004682940286 % [25] Char w: 0.14465896412835882 %
[26] Char z: 0.0783166733268904 % [26] Char z: 0.06814753614292218 %
[27] Char x: 0.05602145631223884 % [27] Char x: 0.03643747663564996 %
[28] Char é: 0.019177665482941794 % [28] Char é: 0.020946094353427522 %
[29] Char q: 0.016626941502452003 % [29] Char ó: 0.013600415430871343 %
[30] Char q: 0.013018579476609468 %
The first 30 characters have an accumulated ratio of 0.9997184756495605. The first 31 characters have an accumulated ratio of 0.9992516135038306.
936 sequences found. 1079 sequences found.
First 512 (typical positive ratio): 0.9962304038307248 First 508 (typical positive ratio): 0.995012453333286
Next 512 (512-1024): 0.007598323324103491 Next 198 (706-508): 0.003993410296057376
Rest: -5.2909066017292616e-17 Rest: 0.0009941363706565953
- Processing end: 2021-03-16 01:36:49.182013 - Processing end: 2022-11-30 20:52:37.084319

View File

@ -42,7 +42,7 @@
/** /**
* Generated by BuildLangModel.py * Generated by BuildLangModel.py
* On: 2021-03-16 01:36:49.098484 * On: 2022-11-30 20:52:37.003457
**/ **/
/* Character Mapping Table: /* Character Mapping Table:
@ -68,18 +68,18 @@ static const unsigned char Iso_8859_15_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 40,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM, 56, 52,SYM,SYM, 56,SYM,SYM,SYM, 57, 58, 59,SYM, /* BX */ SYM,SYM,SYM,SYM, 55, 56,SYM,SYM, 55,SYM,SYM,SYM, 63, 64, 65,SYM, /* BX */
41, 32, 48, 60, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 66, 35, 67, 62, /* CX */
49, 54, 47, 35, 42, 61, 30,SYM, 19, 55, 38, 62, 31, 51, 50, 44, /* DX */ 32, 49, 60, 29, 48, 68, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
41, 32, 48, 63, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 69, 35, 70, 62, /* EX */
49, 54, 47, 35, 42, 64, 30,SYM, 19, 55, 38, 65, 31, 51, 50, 66, /* FX */ 32, 49, 60, 29, 48, 71, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 72, /* FX */
}; };
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@ -89,18 +89,18 @@ static const unsigned char Iso_8859_1_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
41, 32, 48, 67, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 73, 35, 74, 62, /* CX */
49, 54, 47, 35, 42, 68, 30,SYM, 19, 55, 38, 69, 31, 51, 50, 44, /* DX */ 32, 49, 60, 29, 48, 75, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
41, 32, 48, 70, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 76, 35, 77, 62, /* EX */
49, 54, 47, 35, 42, 71, 30,SYM, 19, 55, 38, 72, 31, 51, 50, 73, /* FX */ 32, 49, 60, 29, 48, 78, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 79, /* FX */
}; };
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@ -110,74 +110,97 @@ static const unsigned char Windows_1252_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM, 74,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 75,ILL, 56,ILL, /* 8X */ SYM,ILL,SYM, 80,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 81,ILL, 55,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 76,ILL, 56, 77, /* 9X */ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 82,ILL, 55, 83, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
41, 32, 48, 78, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 84, 35, 85, 62, /* CX */
49, 54, 47, 35, 42, 79, 30,SYM, 19, 55, 38, 80, 31, 51, 50, 44, /* DX */ 32, 49, 60, 29, 48, 86, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
41, 32, 48, 81, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 87, 35, 88, 62, /* EX */
49, 54, 47, 35, 42, 82, 30,SYM, 19, 55, 38, 83, 31, 51, 50, 84, /* FX */ 32, 49, 60, 29, 48, 89, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 90, /* FX */
}; };
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
static const int Unicode_Char_size = 60; static const unsigned char Ibm865_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
40, 31, 28, 52, 38, 46, 21, 40, 43, 36, 39, 62, 91, 92, 38, 21, /* 8X */
28, 20, 20, 48, 33, 60, 59, 61, 93, 33, 31, 19,SYM, 19,SYM,SYM, /* 9X */
34, 35, 29, 37, 49, 49,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
47, 45, 94, 54, 57, 57, 56, 58, 50, 95, 96, 97, 98, 50, 51,SYM, /* EX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
static const int Unicode_Char_size = 62;
static const unsigned int Unicode_CharOrder[] = static const unsigned int Unicode_CharOrder[] =
{ {
65, 4, 66, 16, 67, 23, 68, 7, 69, 0, 70, 13, 71, 10, 72, 18, 65, 4, 66, 16, 67, 23, 68, 7, 69, 0, 70, 13, 71, 10, 72, 18,
73, 5, 74, 24, 75, 12, 76, 9, 77, 11, 78, 2, 79, 8, 80, 17, 73, 5, 74, 24, 75, 12, 76, 8, 77, 11, 78, 2, 79, 9, 80, 17,
81, 29, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27, 81, 30, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27,
89, 22, 90, 26, 97, 4, 98, 16, 99, 23, 100, 7, 101, 0,102, 13, 89, 22, 90, 26, 97, 4, 98, 16, 99, 23, 100, 7, 101, 0,102, 13,
103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 9, 109, 11,110, 2, 103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 8, 109, 11,110, 2,
111, 8, 112, 17, 113, 29, 114, 1, 115, 6, 116, 3, 117, 15,118, 14, 111, 9, 112, 17, 113, 30, 114, 1, 115, 6, 116, 3, 117, 15,118, 14,
119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,216, 19, 119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,211, 29,
229, 21, 230, 20, 233, 28, 248, 19, 216, 19, 229, 21, 230, 20, 233, 28, 243, 29, 248, 19,
}; };
/* Model Table: /* Model Table:
* Total sequences: 936 * Total considered sequences: 1079 / 961
* First 512 sequences: 0.9962304038307248 * - Positive sequences: first 508 (0.995012453333286)
* Next 512 sequences (512-1024): 0.003769596169275244 * - Probable sequences: next 198 (706-508) (0.003993410296057376)
* Rest: -5.2909066017292616e-17 * - Neutral sequences: last 255 (0.0009941363706565953)
* - Negative sequences: -118 (off-ratio)
* Negative sequences: TODO * Negative sequences: TODO
*/ */
static const PRUint8 DanishLangModel[] = static const PRUint8 DanishLangModel[] =
{ {
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,3,0,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,2,3,1,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,2,1,
3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0, 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,3,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,3,3,3,3,3,3,0,2, 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,1,0,3,3,3,3,3,3,0,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,1,3,3,1,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,1,1,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,0, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,1,0,2,2,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,3,3,2,0,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,1,0,2,1,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,3,3,3,3,2,2,1,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,2,0,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,3,3,3,2,3,2,0,0,1,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,0, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,1,1,2,1,0,
3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,2,2,2,0,0,2,0, 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,1,3,1,1,1,1,1,0,
3,3,2,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,2,3,2,0,3,2,0, 3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,3,2,2,3,3,3,3,3,2,3,1,1,2,1,1,0,
3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,3,2,3,3,2,0,3,0,2,0,0,2, 3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,2,2,2,3,3,3,2,1,3,0,0,0,1,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,0,2,3,2,2,2,2,0,2, 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,1,1,0,2,3,2,2,2,3,1,0,0,
3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,3,3,2,2,3,3,3,3,2,3,2,2,0,2,0, 3,3,3,3,3,3,3,2,3,3,0,2,1,1,1,3,3,1,2,3,3,3,3,2,3,1,1,0,1,2,0,
3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,2,2,2,0,2,0,2,0, 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,3,1,1,0,1,0,2,1,0,
3,3,3,3,3,3,3,2,3,2,2,3,2,2,3,3,2,2,2,3,3,3,3,2,3,2,0,0,2,0, 3,3,3,3,3,3,3,2,3,3,1,3,2,2,3,3,1,1,2,3,3,3,3,1,3,3,0,1,1,1,2,
3,3,3,3,2,2,3,3,0,3,3,3,3,3,3,2,3,2,2,0,0,0,2,2,3,0,0,0,0,0, 3,3,3,3,1,2,3,3,3,1,3,3,3,2,3,1,3,2,1,0,0,0,2,0,3,0,0,0,0,0,0,
2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,0,0,2,0,0,0,0,0,0, 2,3,3,3,1,3,3,3,3,3,3,3,3,3,3,2,3,2,1,0,0,0,0,2,0,0,0,0,0,0,0,
3,3,3,3,0,0,3,3,2,3,2,2,3,2,3,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0, 3,3,3,3,0,0,3,3,3,1,2,1,3,2,3,0,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,0,0,2,2,2,2,0,0,0, 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,1,0,0,0,2,2,1,2,1,0,0,0,
3,3,2,3,3,3,2,2,3,3,2,2,3,2,2,3,2,2,3,0,3,0,3,3,0,0,2,0,2,2, 3,3,1,3,3,3,2,3,3,3,1,2,3,1,1,3,1,1,3,0,1,0,3,3,1,1,1,0,1,0,1,
3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,3,3,2,3,0,0,2,2,0,2,0, 3,3,3,3,3,3,3,3,3,3,2,1,2,1,1,3,2,1,2,3,3,1,3,0,0,0,0,0,1,2,0,
3,2,2,2,3,3,2,2,3,2,0,2,2,2,0,2,2,0,3,0,2,0,2,2,0,2,0,0,0,0, 3,2,2,2,3,3,2,1,2,3,0,1,1,1,0,2,2,0,2,0,1,0,1,1,1,2,1,0,0,0,0,
3,2,2,2,3,3,2,2,3,0,2,2,2,0,2,2,2,2,2,0,0,0,2,2,2,2,2,2,0,0, 3,1,1,1,3,3,1,1,1,3,1,2,1,1,0,2,1,1,1,0,0,0,2,1,2,1,3,0,0,0,0,
3,2,2,2,3,3,2,0,2,2,0,0,0,2,2,2,2,2,0,0,0,0,0,2,0,2,0,2,0,0, 2,1,1,1,2,3,1,1,2,2,0,0,1,1,1,1,1,2,2,1,0,0,1,2,0,1,0,1,0,0,0,
2,2,3,2,2,0,2,2,2,2,2,2,0,0,2,0,2,2,0,0,0,0,0,2,0,0,2,0,0,0, 2,2,3,2,1,0,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,
0,2,0,0,2,2,0,2,2,2,0,0,2,2,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,2, 0,3,2,2,1,0,1,0,2,0,1,0,2,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,
2,0,0,0,2,2,0,0,0,0,0,0,0,1,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,
}; };
@ -185,8 +208,8 @@ const SequenceModel Iso_8859_15DanishModel =
{ {
Iso_8859_15_CharToOrderMap, Iso_8859_15_CharToOrderMap,
DanishLangModel, DanishLangModel,
30, 31,
(float)0.9962304038307248, (float)0.9990058636293434,
PR_TRUE, PR_TRUE,
"ISO-8859-15", "ISO-8859-15",
"da" "da"
@ -196,8 +219,8 @@ const SequenceModel Iso_8859_1DanishModel =
{ {
Iso_8859_1_CharToOrderMap, Iso_8859_1_CharToOrderMap,
DanishLangModel, DanishLangModel,
30, 31,
(float)0.9962304038307248, (float)0.9990058636293434,
PR_TRUE, PR_TRUE,
"ISO-8859-1", "ISO-8859-1",
"da" "da"
@ -207,19 +230,30 @@ const SequenceModel Windows_1252DanishModel =
{ {
Windows_1252_CharToOrderMap, Windows_1252_CharToOrderMap,
DanishLangModel, DanishLangModel,
30, 31,
(float)0.9962304038307248, (float)0.9990058636293434,
PR_TRUE, PR_TRUE,
"WINDOWS-1252", "WINDOWS-1252",
"da" "da"
}; };
const SequenceModel Ibm865DanishModel =
{
Ibm865_CharToOrderMap,
DanishLangModel,
31,
(float)0.9990058636293434,
PR_TRUE,
"IBM865",
"da"
};
const LanguageModel DanishModel = const LanguageModel DanishModel =
{ {
"da", "da",
Unicode_CharOrder, Unicode_CharOrder,
60, 62,
DanishLangModel, DanishLangModel,
30, 31,
(float)0.9962304038307248, (float)0.9992516135038306,
}; };

View File

@ -197,8 +197,8 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel); mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel); mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
mProbers[104] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel); mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
mProbers[105] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel); mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
Reset(); Reset();
} }

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__ #define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 106 #define NUM_OF_SBCS_PROBERS 107
class nsCharSetProber; class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber { class nsSBCSGroupProber: public nsCharSetProber {