mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
BuildLangModel: forgot to add charset/language files.
This commit is contained in:
parent
2bade77bf9
commit
6b2722885a
109
script/BuildLangModelLogs/LangHungarianModel.log
Normal file
109
script/BuildLangModelLogs/LangHungarianModel.log
Normal file
@ -0,0 +1,109 @@
|
||||
= Logs of language model for Hungarian (hu) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2015-12-12 18:01:21.560682
|
||||
- Maximum depth: 2
|
||||
- Max number of pages: 50
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Kezdőlap (revision 12748721)
|
||||
1722 (revision 16471860)
|
||||
1780 (revision 16407861)
|
||||
1800 (revision 15028835)
|
||||
1831 (revision 16469576)
|
||||
1848–49-es forradalom és szabadságharc (revision 16955214)
|
||||
1875 (revision 16798555)
|
||||
1895 (revision 16649417)
|
||||
1900 (revision 16961019)
|
||||
1905 (revision 16601113)
|
||||
1915 (revision 16792868)
|
||||
1940 (revision 16936087)
|
||||
1950 (revision 16820817)
|
||||
1970 (revision 16093156)
|
||||
1985 (revision 16463340)
|
||||
1995 (revision 16945805)
|
||||
1998 (revision 16542908)
|
||||
2003 (revision 16943939)
|
||||
2015 (revision 16960983)
|
||||
73. Golden Globe-gála (revision 16937296)
|
||||
Akacuki (revision 16960353)
|
||||
Akasztottak erdeje (regény) (revision 16918702)
|
||||
Alan Hodgkinson (revision 16953214)
|
||||
Alfred Bernhard Nobel (revision 16654409)
|
||||
Alkotmány (revision 16784843)
|
||||
André-Marie Ampère (revision 16865419)
|
||||
Angela Merkel (revision 16960753)
|
||||
Anne Baxter (revision 15572176)
|
||||
Az irgalmasság rendkívüli szentéve (revision 16951018)
|
||||
Az év embereinek listája (revision 16961722)
|
||||
Bencések (revision 16853524)
|
||||
Boeing 747–400 (revision 16947261)
|
||||
Chantal Szent Johanna Franciska (revision 16371923)
|
||||
December 12. (revision 15637986)
|
||||
December 13. (revision 16546152)
|
||||
Dinamó (revision 15949492)
|
||||
Dionne Warwick (revision 16522754)
|
||||
Elektrodinamika (revision 14888277)
|
||||
Elektromosság (revision 16051899)
|
||||
Enciklopédia (revision 16556513)
|
||||
Eric Maskin (revision 16907781)
|
||||
Európai migrációs válság (revision 16922218)
|
||||
Eötvös Loránd (revision 16960057)
|
||||
Eötvös Loránd Tudományegyetem (revision 16684410)
|
||||
Fellner Jakab (revision 16960223)
|
||||
Feltaláló (revision 13609621)
|
||||
Ferenc pápa (revision 16928970)
|
||||
Frank Sinatra (revision 16927399)
|
||||
François Jean Dominique Arago (revision 16197941)
|
||||
Gabriella (revision 16906500)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2015-12-12 18:02:46.729734
|
||||
|
||||
55 characters appeared 375370 times.
|
||||
|
||||
First 32 characters:
|
||||
[ 0] Char e: 9.710685457015744 %
|
||||
[ 1] Char a: 8.803314063457389 %
|
||||
[ 2] Char t: 7.322375256413672 %
|
||||
[ 3] Char s: 6.666222660308496 %
|
||||
[ 4] Char l: 5.73967019207715 %
|
||||
[ 5] Char r: 5.4341050163838345 %
|
||||
[ 6] Char n: 5.39920611663159 %
|
||||
[ 7] Char i: 4.773689959240216 %
|
||||
[ 8] Char o: 4.347976663025815 %
|
||||
[ 9] Char k: 4.289634227562138 %
|
||||
[10] Char z: 4.244611982843594 %
|
||||
[11] Char á: 3.7855982097663636 %
|
||||
[12] Char m: 3.2144284306151265 %
|
||||
[13] Char g: 3.0727016010869277 %
|
||||
[14] Char é: 3.0295441830727015 %
|
||||
[15] Char b: 2.287609558568879 %
|
||||
[16] Char d: 1.9966965926952074 %
|
||||
[17] Char v: 1.8832085675466872 %
|
||||
[18] Char y: 1.8453792258305137 %
|
||||
[19] Char u: 1.5155713029810587 %
|
||||
[20] Char h: 1.2960545595012922 %
|
||||
[21] Char p: 1.288861656498921 %
|
||||
[22] Char j: 1.2363801049631031 %
|
||||
[23] Char c: 1.0951860830647095 %
|
||||
[24] Char f: 1.0256546873751233 %
|
||||
[25] Char ö: 1.020859418706876 %
|
||||
[26] Char ó: 0.9955510562911262 %
|
||||
[27] Char ő: 0.8399712283879905 %
|
||||
[28] Char í: 0.6340410794682579 %
|
||||
[29] Char ü: 0.4211844313610571 %
|
||||
[30] Char ú: 0.3295415190345526 %
|
||||
[31] Char ű: 0.2056637451048299 %
|
||||
|
||||
The first 32 characters have an accumulated ratio of 0.9975117883688093.
|
||||
|
||||
1084 sequences found.
|
||||
|
||||
First 512 (typical positive ratio): 0.9748272224933486
|
||||
Next 512 (512-1024): 5.328076298052588e-06
|
||||
Rest: 0.0001889139024889644
|
||||
|
||||
- Processing end: 2015-12-12 18:02:46.902033
|
||||
73
script/charsets/iso-8859-2.py
Normal file
73
script/charsets/iso-8859-2.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# Canonical name of the charset described by this module.
name = 'ISO-8859-2'
# Other names under which the same charset is known.
aliases = ['ISO_8859-2:1987', 'ISO_8859-2', 'iso-ir-101',
'csISOLatin2', 'latin2', 'l2']

# Language codes associated with this charset, grouped under the keys
# 'complete' and 'incomplete'.
# NOTE(review): presumably 'complete' means every letter of the language is
# representable in this charset and 'incomplete' means some letters are
# missing -- confirm against the consumer of this dict.
language = \
{
'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl',
'hsb', 'dsb', 'tk' ],
'incomplete': [ 'ro' ]
}
|
||||
|
||||
# Character-class table for ISO-8859-2: 16 rows of 16 entries, one entry per
# byte value. The row comment on the right gives the high nibble (0X..FX) and
# the column header below gives the low nibble (X0..XF), so the entry for
# byte 0xAB sits in row "AX", column "XB".
# The class names (CTR, RET, SYM, NUM, LET) are not defined in this file;
# presumably they come from the `codepoints` star import -- TODO confirm.
# RET is used only for 0x0A and 0x0D; 0x80-0x9F are all classed as CTR.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET, # AX
SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
]
|
||||
75
script/charsets/windows-1250.py
Normal file
75
script/charsets/windows-1250.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# Canonical name of the charset described by this module.
name = 'WINDOWS-1250'
# Other names under which the same charset is known.
aliases = ['cswindows1250']

# Language codes associated with this charset, grouped under the keys
# 'complete' and 'incomplete'.
language = {
    # Used under Microsoft Windows to represent texts in Central European and
    # Eastern European languages that use Latin script, such as Polish, Czech,
    # Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
    # Romanian (before 1993 spelling reform) and Albanian.
    #
    # Czech ('cs') and Slovak ('sk') are named above but were missing from
    # the list; they are appended so that the order of the pre-existing
    # entries is unchanged.
    'complete': ['pl', 'hu', 'sl', 'bs', 'hr', 'sr', 'ro', 'sq', 'de',
                 'cs', 'sk'],
    'incomplete': [],
}
|
||||
|
||||
# Character-class table for WINDOWS-1250: 16 rows of 16 entries, one entry
# per byte value. The row comment on the right gives the high nibble (0X..FX)
# and the column header below gives the low nibble (X0..XF), so the entry for
# byte 0xAB sits in row "AX", column "XB".
# The class names (CTR, RET, SYM, NUM, LET, ILL) are not defined in this file;
# presumably they come from the `codepoints` star import -- TODO confirm.
# ILL is used for 0x81, 0x83, 0x88, 0x90 and 0x98 only (presumably the byte
# values this code page leaves unassigned -- verify against the CP1250 table).
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = \
[
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 8X
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 9X
SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX
SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,LET,SYM,LET,LET, # BX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
]
|
||||
74
script/langs/hu.py
Normal file
74
script/langs/hu.py
Normal file
@ -0,0 +1,74 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##

# The human-readable name of the language, in English.
name = 'Hungarian'
# Use the 2-letter ISO 639-1 code if possible, a 3-letter ISO code otherwise,
# or a code from another catalog as a last resort.
code = 'hu'
# Q, W, X and Y are only used for foreign words, so the full ASCII alphabet
# is not assumed; the letters in use are listed explicitly in `alphabet`.
use_ascii = False
# The charsets we want to support and create data for.
charsets = ['ISO-8859-2', 'WINDOWS-1250']

## Optional Properties ##

# Alphabet characters, split into separate string pieces so that the gaps
# (the omitted letters q, w, x and y) are clearly visible.
# NOTE(review): 'y' also occurs in the Hungarian digraphs gy, ly, ny and ty,
# and the generated LangHungarianModel.log ranks it 18th by frequency --
# confirm whether it should be listed here.
alphabet = 'abcdefghijklmnop' + 'rstuv' + 'z' + 'áéíóöőúüű'
# The start page(s). Though optional, choosing one yourself is advised.
start_pages = ['Kezdőlap']
# Allows selecting a different language code for the Wikipedia URL; here it
# is simply the same as `code`.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This relies on Python's own rules to determine the upper/lower case of a
# given character.
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
    """Strip wiki heading markers from text returned by the `wikipedia` lib.

    Wherever a run of `=` signs is repeated after some non-`=` text (for
    example `== Title ==`), the whole match is replaced by the enclosed
    text only. Spaces directly after the opening marker are dropped;
    everything outside such matches is returned unchanged.
    """
    heading_markup = re.compile(r'(=+) *([^=]+) *\1')
    return heading_markup.sub(r'\2', content)
|
||||
Loading…
x
Reference in New Issue
Block a user