diff --git a/script/BuildLangModelLogs/LangHungarianModel.log b/script/BuildLangModelLogs/LangHungarianModel.log new file mode 100644 index 0000000..f04ad98 --- /dev/null +++ b/script/BuildLangModelLogs/LangHungarianModel.log @@ -0,0 +1,109 @@ += Logs of language model for Hungarian (hu) = + +- Generated by BuildLangModel.py +- Started: 2015-12-12 18:01:21.560682 +- Maximum depth: 2 +- Max number of pages: 50 + +== Parsed pages == + +Kezdőlap (revision 12748721) +1722 (revision 16471860) +1780 (revision 16407861) +1800 (revision 15028835) +1831 (revision 16469576) +1848–49-es forradalom és szabadságharc (revision 16955214) +1875 (revision 16798555) +1895 (revision 16649417) +1900 (revision 16961019) +1905 (revision 16601113) +1915 (revision 16792868) +1940 (revision 16936087) +1950 (revision 16820817) +1970 (revision 16093156) +1985 (revision 16463340) +1995 (revision 16945805) +1998 (revision 16542908) +2003 (revision 16943939) +2015 (revision 16960983) +73. Golden Globe-gála (revision 16937296) +Akacuki (revision 16960353) +Akasztottak erdeje (regény) (revision 16918702) +Alan Hodgkinson (revision 16953214) +Alfred Bernhard Nobel (revision 16654409) +Alkotmány (revision 16784843) +André-Marie Ampère (revision 16865419) +Angela Merkel (revision 16960753) +Anne Baxter (revision 15572176) +Az irgalmasság rendkívüli szentéve (revision 16951018) +Az év embereinek listája (revision 16961722) +Bencések (revision 16853524) +Boeing 747–400 (revision 16947261) +Chantal Szent Johanna Franciska (revision 16371923) +December 12. (revision 15637986) +December 13. 
(revision 16546152) +Dinamó (revision 15949492) +Dionne Warwick (revision 16522754) +Elektrodinamika (revision 14888277) +Elektromosság (revision 16051899) +Enciklopédia (revision 16556513) +Eric Maskin (revision 16907781) +Európai migrációs válság (revision 16922218) +Eötvös Loránd (revision 16960057) +Eötvös Loránd Tudományegyetem (revision 16684410) +Fellner Jakab (revision 16960223) +Feltaláló (revision 13609621) +Ferenc pápa (revision 16928970) +Frank Sinatra (revision 16927399) +François Jean Dominique Arago (revision 16197941) +Gabriella (revision 16906500) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2015-12-12 18:02:46.729734 + +55 characters appeared 375370 times. + +First 32 characters: +[ 0] Char e: 9.710685457015744 % +[ 1] Char a: 8.803314063457389 % +[ 2] Char t: 7.322375256413672 % +[ 3] Char s: 6.666222660308496 % +[ 4] Char l: 5.73967019207715 % +[ 5] Char r: 5.4341050163838345 % +[ 6] Char n: 5.39920611663159 % +[ 7] Char i: 4.773689959240216 % +[ 8] Char o: 4.347976663025815 % +[ 9] Char k: 4.289634227562138 % +[10] Char z: 4.244611982843594 % +[11] Char á: 3.7855982097663636 % +[12] Char m: 3.2144284306151265 % +[13] Char g: 3.0727016010869277 % +[14] Char é: 3.0295441830727015 % +[15] Char b: 2.287609558568879 % +[16] Char d: 1.9966965926952074 % +[17] Char v: 1.8832085675466872 % +[18] Char y: 1.8453792258305137 % +[19] Char u: 1.5155713029810587 % +[20] Char h: 1.2960545595012922 % +[21] Char p: 1.288861656498921 % +[22] Char j: 1.2363801049631031 % +[23] Char c: 1.0951860830647095 % +[24] Char f: 1.0256546873751233 % +[25] Char ö: 1.020859418706876 % +[26] Char ó: 0.9955510562911262 % +[27] Char ő: 0.8399712283879905 % +[28] Char í: 0.6340410794682579 % +[29] Char ü: 0.4211844313610571 % +[30] Char ú: 0.3295415190345526 % +[31] Char ű: 0.2056637451048299 % + +The first 32 characters have an accumulated ratio of 0.9975117883688093. + +1084 sequences found. 
+ +First 512 (typical positive ratio): 0.9748272224933486 +Next 512 (512-1024): 5.328076298052588e-06 +Rest: 0.0001889139024889644 + +- Processing end: 2015-12-12 18:02:46.902033 diff --git a/script/charsets/iso-8859-2.py b/script/charsets/iso-8859-2.py new file mode 100644 index 0000000..110b4f7 --- /dev/null +++ b/script/charsets/iso-8859-2.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-2' +aliases = ['ISO_8859-2:1987', 'ISO_8859-2', 'iso-ir-101', + 'csISOLatin2', 'latin2', 'l2'] + +language = \ +{ + 'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl', + 'hsb', 'dsb', 'tk' ], + 'incomplete': [ 'ro' ] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET, # AX + SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/charsets/windows-1250.py b/script/charsets/windows-1250.py new file mode 100644 index 0000000..f28547c --- /dev/null +++ b/script/charsets/windows-1250.py @@ -0,0 +1,75 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 
2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1250' +aliases = ['cswindows1250'] + +language = \ +{ + # used under Microsoft Windows to represent texts in Central European and + # Eastern European languages that use Latin script, such as Polish, Czech, + # Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script), + # Romanian (before 1993 spelling reform) and Albanian. + 'complete': [ 'pl', 'hu', 'sl', 'bs', 'hr', 'sr', 'ro', 'sq', 'de' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 8X + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 9X + SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX + SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,LET,SYM,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/hu.py b/script/langs/hu.py new file mode 100644 index 0000000..8ff01cb --- /dev/null +++ b/script/langs/hu.py @@ -0,0 +1,74 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 
2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Hungarian' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'hu' +# Q, W, X, Y are only used for foreign words. 
use_ascii = False
# The charsets we want to support and create data for.
charsets = ['ISO-8859-2', 'WINDOWS-1250']

## Optional Properties ##

# Alphabet characters, written as separate runs so that the gaps left by
# the omitted letters (q, w, x, y) are fully visible.
# NOTE(review): 'y' is omitted here, yet it is part of the Hungarian
# digraphs gy, ly, ny, ty and the generated model log reports it at about
# 1.85 % of all characters -- confirm the omission is intended.
alphabet = 'abcdefghijklmnop' + 'rstuv' + 'z' + 'áéíóöőúüű'
# The start page. Though optional, it is advised to choose one yourself.
start_pages = ['Kezdőlap']
# Gives the possibility to select another code for the Wikipedia URL.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This uses Python's own algorithm to determine the upper/lower-case of a
# given character.
case_mapping = True


# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Strip wiki-style heading markers from *content*.

    Every span of the form ``<run of '='> text <same run of '='>`` is
    replaced by ``text`` alone, with the spaces next to the markers
    removed, e.g. ``'== Title =='`` becomes ``'Title'``.  The pattern is
    not anchored to line boundaries, so any two equal-length ``=`` runs
    with only non-``=`` characters between them are treated the same way.

    :param content: the page text to clean (a ``str``).
    :returns: a new string with the heading markers removed.
    """
    # The capture is non-greedy on purpose: a greedy `[^=]+` would swallow
    # the spaces in front of the closing marker, so the trailing ` *` could
    # never match and the extracted title would keep its trailing spaces.
    cleaned = re.sub(r'(=+) *([^=]+?) *\1',
                     r'\2',
                     content)
    return cleaned