From d76d33b88b461c7af1b5f75aa87fc32a4159a847 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Mon, 26 Sep 2016 01:16:55 +0200
Subject: [PATCH] script: character orders in single-byte language models
 should be maxed.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This happened when building a model for Croatian, which can be written
with many different encodings. These encodings also contain many
irrelevant glyphs (i.e. used in other languages), so we ended up with
orders over 255, which breaks when converting to unsigned char.
Let's just make sure that we don't cross the 250 limit (values from 250
up are used for controls, illegal characters, symbols, numbers…). This
means we may have several characters with order 249, but since orders
beyond the frequent character list don't matter, this is not a problem.
---
 script/BuildLangModel.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index d44a5f9..c881147 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -414,10 +414,18 @@ for charset in charsets:
                     uchar = local_lowercase(uchar, lang)
                 for order, (char, ratio) in enumerate(sorted_ratios):
                     if char == ord(uchar):
-                        CTOM_str += '{:3},'.format(order)
+                        CTOM_str += '{:3},'.format(min(249, order))
                         break
                 else:
-                    CTOM_str += '{:3},'.format(n_char)
+                    # XXX: we must make sure the character order does not go
+                    # over the special characters (250 currently). This may
+                    # actually happen when building a model for a language
+                    # writable with many different encodings. So let's just
+                    # cap the order value at 249.
+                    # It may be an interesting alternative to add another
+                    # constant for any character with an order > freqCharCount.
+                    # Maybe IRR (irrelevant character) or simply CHR.
+                    CTOM_str += '{:3},'.format(min(249, n_char))
                     n_char += 1
         CTOM_str += ' /* {:X}X */'.format(line)
     CTOM_str += '\n};\n/*'