From d76d33b88b461c7af1b5f75aa87fc32a4159a847 Mon Sep 17 00:00:00 2001 From: Jehan Date: Mon, 26 Sep 2016 01:16:55 +0200 Subject: [PATCH] script: character orders in single-byte language models should be maxed. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This happened when building a model for Croatian, which can be written with many different encodings. There were also many irrelevant glyphs (i.e. used in other languages) in these encodings, so we ended up with orders over 255, which breaks when converting to unsigned char. Let's just make sure that we don't reach the 250 limit (values from 250 up are used for controls, illegal characters, symbols, numbers…). This means we may have several characters with order 249, but since orders beyond the frequent character list don't matter, this is not a problem. --- script/BuildLangModel.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py index d44a5f9..c881147 100755 --- a/script/BuildLangModel.py +++ b/script/BuildLangModel.py @@ -414,10 +414,18 @@ for charset in charsets: uchar = local_lowercase(uchar, lang) for order, (char, ratio) in enumerate(sorted_ratios): if char == ord(uchar): - CTOM_str += '{:3},'.format(order) + CTOM_str += '{:3},'.format(min(249, order)) break else: - CTOM_str += '{:3},'.format(n_char) + # XXX: we must make sure the character order does not go + # over the special characters (250 currently). This may + # actually happen when building a model for a language + # writable with many different encoding. So let's just + # ceil the order value at 249 max. + # It may be an interesting alternative to add another + # constant for any character with an order > freqCharCount. + # Maybe IRR (irrelevant character) or simply CHR. + CTOM_str += '{:3},'.format(min(249, n_char)) n_char += 1 CTOM_str += ' /* {:X}X */'.format(line) CTOM_str += '\n};\n/*'