#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####

# Third party modules.
import unicodedata
import subprocess
import wikipedia
import importlib
import math
import optparse
import datetime
import operator
import requests
import sys
import re
import os
import random

# Custom modules.
import charsets.db
from charsets.codepoints import *

# Command line processing.
usage = 'Usage: {} <LANG-CODE>\n' \
        '\nEx: `{} fr`'.format(__file__, __file__)

description = "Internal tool for uchardet to generate language data."
cmdline = optparse.OptionParser(usage, description = description)
cmdline.add_option('--max-page',
                   help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
                   action = 'store', type = 'int', dest = 'max_page', default = None)
cmdline.add_option('--max-depth',
                   help = 'Maximum depth when following links from start page (default: 2).',
                   action = 'store', type = 'int',
                   dest = 'max_depth', default = 2)
(options, langs) = cmdline.parse_args()
if len(langs) < 1:
    print("Please select at least one language code.\n")
    exit(1)
if len(langs) > 1:
    print("This script is meant to generate data for one language at a time.\n")
    exit(1)
lang = langs[0]

# Load the language data.
sys_path_backup = sys.path
current_dir = os.path.dirname(os.path.realpath(__file__))
sys.path = [current_dir + '/langs']

try:
    lang = importlib.import_module(lang.lower())
except ImportError:
    print('Unknown language code "{}": '
          'file "langs/{}.py" does not exist.'.format(lang, lang.lower()))
    exit(1)
sys.path = sys_path_backup

charsets = charsets.db.load(lang.charsets)

if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
   lang.start_pages == []:
    # Let's start with the main page, assuming it should have links
    # to relevant pages. On localized Wikipedia editions, this page is
    # usually redirected to a relevant page.
    print("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
          " If you don't get good data, it is advised to set a "
          "`start_pages` variable yourself.".format(lang.code))
    lang.start_pages = ['Main_Page']
if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
    lang.wikipedia_code = lang.code
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
    lang.clean_wikipedia_content = None
if hasattr(lang, 'case_mapping'):
    lang.case_mapping = bool(lang.case_mapping)
else:
    lang.case_mapping = False
if not hasattr(lang, 'custom_case_mapping'):
    lang.custom_case_mapping = None
if not hasattr(lang, 'alphabet') or lang.alphabet is None:
    lang.alphabet = None
if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
    lang.alphabet_mapping = None
if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
    lang.unicode_ranges = None
if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
    if lang.unicode_ranges is not None:
        lang.frequent_ranges = lang.unicode_ranges
    else:
        lang.frequent_ranges = None

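# Note (illustrative only): a language module under langs/ is expected to
# expose attributes like the ones handled above. A hypothetical langs/xx.py
# could look like:
#     code = 'xx'
#     name = 'Example'
#     use_ascii = True
#     charsets = ['ISO-8859-1', 'WINDOWS-1252']
#     start_pages = ['Main_Page']
#     alphabet = 'abcdefghijklmnopqrstuvwxyzé'
#     case_mapping = True
# The remaining attributes (alphabet_mapping, unicode_ranges, frequent_ranges,
# custom_case_mapping, clean_wikipedia_content) are optional and default to
# the values set above.
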
def local_lowercase(text, lang):
    lowercased = ''
    for l in text:
        if lang.custom_case_mapping is not None and \
           l in lang.custom_case_mapping:
            lowercased += lang.custom_case_mapping[l]
        elif l.isupper() and \
             lang.case_mapping and \
             len(unicodedata.normalize('NFC', l.lower())) == 1:
            lowercased += l.lower()
        else:
            lowercased += l
    return lowercased

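# Illustrative example (not part of the generated data): with
# lang.case_mapping enabled and no custom_case_mapping,
# local_lowercase('Paris', lang) returns 'paris', while characters whose
# lowercase form does not normalize to a single codepoint are kept as-is.
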
if lang.use_ascii:
    if lang.alphabet is None:
        lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
    else:
        # Allowing to provide an alphabet in string format rather than list.
        lang.alphabet = list(lang.alphabet)
        lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
if lang.alphabet is not None:
    # Allowing to provide an alphabet in string format rather than list.
    lang.alphabet = list(lang.alphabet)
    if lang.case_mapping or lang.custom_case_mapping is not None:
        lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
        #alphabet = []
        #for l in lang.alphabet:
            #if l.isupper() and \
               #lang.custom_case_mapping is not None and \
               #l in lang.custom_case_mapping:
                #alphabet.append(lang.custom_case_mapping[l])
            #elif l.isupper() and \
                 #lang.case_mapping and \
                 #len(unicodedata.normalize('NFC', l.lower())) == 1:
                #alphabet.append(l.lower())
            #else:
                #alphabet.append(l)
    lang.alphabet = list(set(lang.alphabet))

if lang.alphabet_mapping is not None:
    alphabet_mapping = {}
    for char in lang.alphabet_mapping:
        # Allowing to provide an alphabet in string format rather than list.
        for alt_char in list(lang.alphabet_mapping[char]):
            # While it's easier to write from main character to
            # equivalencies in the language file, we reverse the mapping
            # for simpler usage.
            if lang.case_mapping or lang.custom_case_mapping is not None:
                alphabet_mapping[alt_char] = local_lowercase(char, lang)
            else:
                alphabet_mapping[alt_char] = char
    lang.alphabet_mapping = alphabet_mapping

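# Illustrative example: if a language file declares
#     alphabet_mapping = {'a': 'àâ'}
# the loop above reverses it into {'à': 'a', 'â': 'a'} (with the base
# character lowercased first when case mapping is enabled), so that each
# alternative character can be looked up directly.
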
def normalize_codepoint_ranges(input_range):
    output_range = []
    if input_range is not None:
        for start, end in input_range:
            # Allow to write down characters rather than unicode values.
            if isinstance(start, str):
                start = ord(start)
            if isinstance(end, str):
                end = ord(end)
            if not isinstance(start, int) or not isinstance(end, int):
                sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
            if start > end:
                sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
            else:
                output_range += [(start, end)]
    if len(output_range) == 0:
        output_range = None
    return output_range

lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)

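# Illustrative example: ranges may be written with characters or codepoints,
# e.g. [('a', 'z'), (0x0400, 0x04FF)] normalizes to [(97, 122), (1024, 1279)].
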
# Starting processing.
wikipedia.set_lang(lang.wikipedia_code)

visited_pages = []

# The full list of letter characters.
# The key is the unicode codepoint,
# and the value is the occurrence count.
characters = {}
# Sequence of letters.
# The key is the couple (char1, char2) in unicode codepoint,
# the value is the occurrence count.
sequences = {}
prev_char = None

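# process_text() below fills these two tables: each letter of the Wikipedia
# text increments characters[codepoint], and each pair of consecutive letters
# increments sequences[(previous_codepoint, codepoint)]. Any non-letter
# character resets the sequence (prev_char is set back to None).
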
def process_text(content, lang):
    global charsets
    global characters
    global sequences
    global prev_char

    if lang.clean_wikipedia_content is not None:
        content = lang.clean_wikipedia_content(content)
    # Clean out the Wikipedia syntax for titles.
    content = re.sub(r'(=+) *([^=]+) *\1',
                     r'\2', content)
    # Clean multiple spaces. Newlines and such are normalized to spaces,
    # since they have basically a similar role in the purpose of uchardet.
    content = re.sub(r'\s+', ' ', content)

    if lang.case_mapping or lang.custom_case_mapping is not None:
        content = local_lowercase(content, lang)

    # In Python 3, strings are Unicode.
    # Looping through them returns the expected characters.
    for char in content:
        # Map to main equivalent character.
        if lang.alphabet_mapping is not None and \
           char in lang.alphabet_mapping:
            char = lang.alphabet_mapping[char]

        unicode_value = ord(char)
        is_letter = False
        if unicode_value in characters:
            characters[unicode_value] += 1
            is_letter = True
        elif lang.unicode_ranges is not None:
            for start, end in lang.unicode_ranges:
                if unicode_value >= start and unicode_value <= end:
                    characters[unicode_value] = 1
                    is_letter = True
                    break
        else:
            # We save the character if it is at least in one of the
            # language encodings and it's not a special character.
            for charset in charsets:
                # Does the character exist in the charset?
                try:
                    codepoint = char.encode(charset, 'ignore')
                except LookupError:
                    # Unknown encoding. Use iconv from command line instead.
                    try:
                        call = subprocess.Popen(['iconv', '-f', 'UTF-8', '-t', charset],
                                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                                stderr=subprocess.DEVNULL)
                        if call.poll() is not None:
                            (_, error) = call.communicate(input=b'')
                            print('Error: `iconv` ended with error "{}".\n'.format(error))
                            exit(1)
                        (codepoint, _) = call.communicate(input=char.encode('UTF-8'))
                    except FileNotFoundError:
                        print('Error: "{}" is not a charset supported by Python and `iconv` is not installed.\n'.format(charset))
                        exit(1)

                if codepoint == b'':
                    continue
                # ord() is said to return the unicode codepoint.
                # But it turns out it also gives the codepoint for other
                # charsets if I turn the string to encoded bytes first.
                # Not sure if that is a bug or expected.
                codepoint = ord(codepoint)
                if charsets[charset].charmap[codepoint] == LET:
                    characters[unicode_value] = 1
                    is_letter = True
                    break
        if is_letter:
            if prev_char is not None:
                if (prev_char, unicode_value) in sequences:
                    sequences[(prev_char, unicode_value)] += 1
                else:
                    sequences[(prev_char, unicode_value)] = 1
            prev_char = unicode_value
        else:
            prev_char = None

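# visit_pages() below crawls Wikipedia recursively: it processes each start
# page, collects a shuffled subset of its links, and follows them up to
# options.max_depth levels, stopping early once options.max_page pages have
# been visited.
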
def visit_pages(titles, depth, lang, logfd):
    global visited_pages
    global options

    if len(titles) == 0:
        return

    next_titles = []
    if options.max_page is not None:
        max_titles = int(options.max_page/(options.max_depth * options.max_depth))
    else:
        max_titles = sys.maxsize
    for title in titles:
        if options.max_page is not None and \
           len(visited_pages) > options.max_page:
            return
        if title in visited_pages:
            continue

        # Ugly hack skipping internal pages
        if 'wiki' in title or 'Wiki' in title:
            print('Skipping', title)
            continue

        visited_pages += [title]
        try:
            page = wikipedia.page(title, auto_suggest=False)
        except (wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError) as error:
            # Let's just discard a page when I get an exception.
            print("Discarding page {}: {}\n".format(title, error))
            continue
        logfd.write("\n{} (revision {})".format(title, page.revision_id))
        logfd.flush()

        process_text(page.content, lang)
        try:
            links = page.links
            random.shuffle(links)
            if len(links) > max_titles:
                links = links[:max_titles]
            next_titles += links
        except KeyError:
            pass

    if depth >= options.max_depth:
        return

    random.shuffle(next_titles)
    visit_pages(next_titles, depth + 1, lang, logfd)

language_c = lang.name.replace('-', '_').title()
build_log = current_dir + '/BuildLangModelLogs/Lang{}Model.log'.format(language_c)
logfd = open(build_log, 'w')
logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
if options.max_page is not None:
    logfd.write('\n- Max number of pages: {}'.format(options.max_page))
logfd.write('\n\n== Parsed pages ==\n')
logfd.flush()
try:
    visit_pages(lang.start_pages, 0, lang, logfd)
except requests.exceptions.ConnectionError:
    print('Error: connection to Wikipedia failed. Aborting\n')
    exit(1)
logfd.write('\n\n== End of Parsed pages ==')
logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
logfd.flush()

########### CHARACTERS ###########

# Character ratios.
ratios = {}
n_char = len(characters)
occurrences = sum(characters.values())

logfd.write("\n{} characters appeared {} times.\n".format(n_char, occurrences))
for char in characters:
    ratios[char] = characters[char] / occurrences
    #logfd.write("Character '{}' usage: {} ({} %)\n".format(chr(char),
    #                                                       characters[char],
    #                                                       ratios[char] * 100))

sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1),
                       reverse=True)
# Accumulated ratios of the frequent chars.
accumulated_ratios = 0

# If there is no alphabet defined, we just use the first 64 letters, which was
# the original default.
# If there is an alphabet, we make sure all the alphabet characters are in the
# frequent list, and we stop then. There may therefore be more or less than
# 64 frequent characters depending on the language.
logfd.write('\nMost Frequent characters:')
very_freq_count = 0
very_freq_ratio = 0
if lang.alphabet is None and lang.frequent_ranges is None:
    freq_count = min(64, len(sorted_ratios))
    for order, (char, ratio) in enumerate(sorted_ratios):
        if order >= freq_count:
            break
        logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
        accumulated_ratios += ratio
        if very_freq_ratio < 0.4:
            very_freq_count += 1
            very_freq_ratio += ratio
elif lang.alphabet is not None:
    freq_count = 0
    for order, (char, ratio) in enumerate(sorted_ratios):
        if len(lang.alphabet) == 0:
            break
        if chr(char) in lang.alphabet:
            lang.alphabet.remove(chr(char))
        logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
        accumulated_ratios += ratio
        freq_count += 1
        if very_freq_ratio < 0.4:
            very_freq_count += 1
            very_freq_ratio += ratio
    else:
        if len(lang.alphabet) > 0:
            print("Error: alphabet characters are absent from data collection"
                  "\n Please check the configuration or the data."
                  "\n Missing characters: {}".format(", ".join(lang.alphabet)))
            exit(1)
elif lang.frequent_ranges is not None:
    # How many characters in the frequent range?
    frequent_ranges_size = 0
    for start, end in lang.frequent_ranges:
        frequent_ranges_size += end - start + 1

    # Keep ratio for at least all the characters inside the frequent
    # ranges.
    freq_count = 0
    for order, (char, ratio) in enumerate(sorted_ratios):
        for start, end in lang.frequent_ranges:
            if char >= start and char <= end:
                freq_count += 1
                accumulated_ratios += ratio
                logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
                frequent_ranges_size -= 1
                break
        else:
            # A frequent character in the non-frequent range.
            logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
            freq_count += 1
            accumulated_ratios += ratio

        if very_freq_ratio < 0.4:
            very_freq_count += 1
            very_freq_ratio += ratio

        if frequent_ranges_size <= 0:
            break

low_freq_order = freq_count - 1
low_freq_ratio = 0
for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
    if low_freq_ratio < 0.03:
        low_freq_ratio += ratio
        low_freq_order -= 1
    else:
        break

logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))

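# Summary of the statistics computed above (as used later in the generated
# LanguageModel): freq_count is the number of retained frequent characters,
# very_freq_count/very_freq_ratio describe the few most frequent characters
# accumulating roughly 40% of all occurrences, and low_freq_order/
# low_freq_ratio describe the tail of frequent characters accumulating
# roughly the last 3%.
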
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
    c_code = header_fd.read()

c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
c_code += ' **/\n'

c_code += \
"""
/* Character Mapping Table:
 * ILL: illegal character.
 * CTR: control character specific to the charset.
 * RET: carriage/return.
 * SYM: symbol (punctuation) that does not belong to a word.
 * NUM: 0 - 9.
 *
 * Other characters are ordered by probabilities
 * (0 is the most common character in the language).
 *
 * Orders are generic to a language. So the codepoint with order X in
 * CHARSET1 maps to the same character as the codepoint with the same
 * order X in CHARSET2 for the same language.
 * As such, it is possible to get missing orders. For instance the
 * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
 * even though they are both used for French. Same for the euro sign.
 */
"""

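# The loop below emits one 256-entry CharToOrderMap per charset: each byte
# value maps either to a class constant (ILL/CTR/RET/SYM/NUM) or to the
# frequency order of the letter it encodes. A generated row could look like
# (illustrative values only):
#     SYM,SYM,NUM,  0, 15,  3,249,SYM, ...
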
for charset in charsets:
    charset_c = charset.replace('-', '_').title()
    CTOM_str = 'static const unsigned char {}_CharToOrderMap[]'.format(charset_c)
    CTOM_str += ' =\n{'
    for line in range(0, 16):
        CTOM_str += '\n '
        for column in range(0, 16):
            cp = line * 16 + column
            cp_type = charsets[charset].charmap[cp]
            if cp_type == ILL:
                CTOM_str += 'ILL,'
            elif cp_type == RET:
                CTOM_str += 'RET,'
            elif cp_type == CTR:
                CTOM_str += 'CTR,'
            elif cp_type == SYM:
                CTOM_str += 'SYM,'
            elif cp_type == NUM:
                CTOM_str += 'NUM,'
            else: # LET
                try:
                    uchar = bytes([cp]).decode(charset)
                except UnicodeDecodeError:
                    print('Unknown character 0X{:X} in {}.'.format(cp, charset))
                    print('Please verify your charset specification.\n')
                    exit(1)
                except LookupError:
                    # Unknown encoding. Use iconv instead.
                    try:
                        call = subprocess.Popen(['iconv', '-t', 'UTF-8', '-f', charset],
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE,
                                                stderr=subprocess.PIPE)
                        if call.poll() is not None:
                            (_, error) = call.communicate(input=b'')
                            print('Error: `iconv` ended with error "{}".\n'.format(error))
                            exit(1)
                        (uchar, _) = call.communicate(input=bytes([cp]))
                        uchar = uchar.decode('UTF-8')
                    except FileNotFoundError:
                        print('Error: "{}" is not a charset supported by Python and `iconv` is not installed.\n'.format(charset))
                        exit(1)
                #if lang.case_mapping and uchar.isupper() and \
                   #len(unicodedata.normalize('NFC', uchar.lower())) == 1:
                # Unless we encounter special cases of characters with no
                # composed lowercase, we lowercase it.
                if lang.case_mapping or lang.custom_case_mapping is not None:
                    uchar = local_lowercase(uchar, lang)
                if lang.alphabet_mapping is not None and uchar in lang.alphabet_mapping:
                    uchar = lang.alphabet_mapping[uchar]
                for order, (char, ratio) in enumerate(sorted_ratios):
                    if char == ord(uchar):
                        CTOM_str += '{:3},'.format(min(249, order))
                        break
                else:
                    # XXX: we must make sure the character order does not go
                    # over the special characters (250 currently). This may
                    # actually happen when building a model for a language
                    # writable with many different encodings. So let's just
                    # ceil the order value at 249 max.
                    # It may be an interesting alternative to add another
                    # constant for any character with an order > freqCharCount.
                    # Maybe IRR (irrelevant character) or simply CHR.
                    CTOM_str += '{:3},'.format(min(249, n_char))
                    n_char += 1
        CTOM_str += ' /* {:X}X */'.format(line)
    CTOM_str += '\n};\n/*'
    CTOM_str += 'X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF'
    CTOM_str += ' */\n\n'
    c_code += CTOM_str

## UNICODE frequency.

# Since we can't map the full character table from encoding to order,
# just create a list from the most common characters from the language.
# The list is ordered by unicode code points (hence can be used
# generically for various encoding schemes as it is not encoding
# specific), allowing to search code points efficiently with a divide
# and conquer search algorithm.
# Each code point is immediately followed by its order.

# Keep the freq_count most frequent characters.
sorted_chars = [(char, freq, order) for order, (char, freq) in
                enumerate(sorted_ratios)][:freq_count]
max_order = len(sorted_chars)

# Add equivalency characters.
equivalent = []
if lang.case_mapping:
    for char, ratio, order in sorted_chars:
        uppercased = chr(char).upper()
        try:
            if char != ord(uppercased):
                equivalent += [(ord(uppercased), ratio, order)]
        except TypeError:
            # This happens for some cases such as 'SS' as uppercase of 'ß'.
            # Just ignore such cases.
            sys.stderr.write("Ignoring '{}' as uppercase equivalent of '{}'.\n".format(uppercased, char))

if lang.alphabet_mapping is not None:
    for alt_c in lang.alphabet_mapping:
        for char, ratio, order in sorted_chars:
            if alt_c == chr(char):
                sys.stderr.write("ALREADY {}\n".format(alt_c))
                exit(1)
            elif char == ord(lang.alphabet_mapping[alt_c]):
                equivalent += [(ord(alt_c), ratio, order)]
                break
        else:
            sys.stderr.write("Base equivalent for {} not found in frequent characters!\n".format(alt_c))
            exit(1)

sorted_chars += equivalent

# Order by code point.
sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))

CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))

CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
CTOM_str += ' =\n{'
column = 0

max_char_width = math.floor(math.log10(sorted_chars[-1][0])) + 1
max_order_width = math.floor(math.log10(max_order)) + 1

for char, ratio, order in sorted_chars:
    if column % 8 == 0:
        CTOM_str += '\n '
    column += 1
    CTOM_str += '{}{:>{width}}, '.format('' if column % 8 == 0 else ' ', char, width=max_char_width)
    CTOM_str += '{:>{width}},'.format(order, width=max_order_width)

CTOM_str += '\n};\n\n'
c_code += CTOM_str

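# The Unicode_CharOrder array just generated stores (codepoint, order) pairs
# flattened and sorted by codepoint, so the detector can binary-search a
# codepoint and retrieve its frequency order. Illustrative excerpt:
#     97,  0,  98, 18,  99, 12, ...
# (codepoint 97 = 'a' with order 0, etc.; the values are made up.)
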
########### SEQUENCES ###########

ratios = {}
occurrences = sum(sequences.values())

accumulated_seq_count = 0
order_3 = -1
order_2 = -1
ratio_3 = -1
ratio_2 = -1
count_512 = 0
count_1024 = 0
sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
                     reverse=True)
for order, ((c1, c2), count) in enumerate(sorted_seqs):
    accumulated_seq_count += count
    if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
        order_3 = order
        ratio_3 = accumulated_seq_count / occurrences
    elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
        order_2 = order
        ratio_2 = accumulated_seq_count / occurrences
    if order < 512:
        count_512 += count
    elif order < 1024:
        count_1024 += count

    if order_3 != -1 and order_2 != -1:
        break

if order_3 == -1 or order_2 == -1:
    # This will probably never happen. It would require a language with
    # very few possible sequences, each of them widely used. Just add this
    # code for completion, but it won't likely ever be run.
    order_3 = 512
    order_2 = 1024
    ratio_3 = count_512 / occurrences
    ratio_2 = (count_512 + count_1024) / occurrences

logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))

c_code += """
/* Model Table:
 * Total considered sequences: {} / {}
 * - Positive sequences: first {} ({})
 * - Probable sequences: next {} ({}-{}) ({})
 * - Neutral sequences: last {} ({})
 * - Negative sequences: {} (off-ratio)
 * Negative sequences: TODO""".format(len(sorted_seqs),
                                      freq_count * freq_count,
                                      order_3, ratio_3,
                                      order_2 - order_3,
                                      order_2, order_3,
                                      ratio_2 - ratio_3,
                                      freq_count * freq_count - order_2,
                                      1 - ratio_2,
                                      freq_count * freq_count - len(sorted_seqs))

logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
                                           order_2, order_3,
                                           ratio_2 - ratio_3))
logfd.write("\nRest: {}".format(1 - ratio_2))

c_code += "\n */\n"

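# The LangModel table built below rates every pair of frequent characters:
# pairs among the most common sequences (order < order_3, ~99.5% of observed
# occurrences) get 3, the next ones (order < order_2, up to ~99.9%) get 2,
# remaining observed pairs get 1, and pairs never seen in the corpus get 0.
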
LM_str = 'static const PRUint8 {}LangModel[]'.format(language_c)
LM_str += ' =\n{'
for line in range(0, freq_count):
    LM_str += '\n '
    for column in range(0, freq_count):
        # Let's not make the lines too long.
        if freq_count > 40 and column == int(freq_count / 2):
            LM_str += '\n '
        first_order = int(line)
        second_order = column
        if first_order < len(sorted_ratios) and second_order < len(sorted_ratios):
            (first_char, _) = sorted_ratios[first_order]
            (second_char, _) = sorted_ratios[second_order]
            if (first_char, second_char) in sequences:
                for order, (seq, _) in enumerate(sorted_seqs):
                    if seq == (first_char, second_char):
                        if order < order_3:
                            LM_str += '3,'
                        elif order < order_2:
                            LM_str += '2,'
                        else:
                            LM_str += '1,'
                        break
                else:
                    # Impossible: the sequence is known to be in `sequences`.
                    LM_str += '0,'
            else:
                LM_str += '0,'
        else:
            # It may indeed happen that we find fewer than 64 letters used for
            # a given language.
            LM_str += '0,'
LM_str += '\n};\n'
c_code += LM_str

for charset in charsets:
    charset_c = charset.replace('-', '_').title()
    SM_str = '\n\nconst SequenceModel {}{}Model ='.format(charset_c, language_c)
    SM_str += '\n{\n '
    SM_str += '{}_CharToOrderMap,\n {}LangModel,'.format(charset_c, language_c)
    SM_str += '\n {},'.format(freq_count)
    SM_str += '\n (float){},'.format(ratio_2)
    SM_str += '\n {},'.format('PR_TRUE' if lang.use_ascii else 'PR_FALSE')
    SM_str += '\n "{}",'.format(charset)
    SM_str += '\n "{}"'.format(lang.code)
    SM_str += '\n};'
    c_code += SM_str

SM_str = '\n\nconst LanguageModel {}Model ='.format(language_c)
SM_str += '\n{'
SM_str += '\n "{}",'.format(lang.code)
SM_str += '\n Unicode_CharOrder,'
SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
SM_str += '\n {}LangModel,'.format(language_c)
SM_str += '\n {},'.format(freq_count)
SM_str += '\n {},'.format(very_freq_count)
SM_str += '\n (float){},'.format(very_freq_ratio)
SM_str += '\n {},'.format(low_freq_order)
SM_str += '\n (float){},'.format(low_freq_ratio)
SM_str += '\n};'
c_code += SM_str

c_code += '\n'

lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c)
with open(lang_model_file, 'w') as cpp_fd:
    cpp_fd.write(c_code)

logfd.write('\n\n- Processing end: {}\n'.format(str(datetime.datetime.now())))
logfd.close()

print("The following language model file has been generated: {}"
      "\nThe build log is available in: {}"
      "\nTest them and commit them.".format(lang_model_file, build_log))