BuildLangModel: lowercase only when resulting char has a composed form.

I ran into this with the Turkish dotted capital 'İ' (U+0130): lowercasing it with Python's str.lower() returns a decomposed sequence ('i' followed by U+0307 COMBINING DOT ABOVE) that NFC normalization is not able to recompose into a single character. Therefore ord() raised a TypeError because the string length was 2.
2026-01-01 03:12:24 +08:00 · 2015-12-04 01:30:21 +01:00 · 2015-12-04 01:30:21 +01:00 · a167bd5e42
commit a167bd5e42
parent b56a3c7b84
1 changed files with 6 additions and 2 deletions
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@ -39,6 +39,7 @@
 # ##### END LICENSE BLOCK #####

 # Third party modules.
+import unicodedata
 import wikipedia
 import importlib
 import optparse
@ -114,7 +115,7 @@ if lang.alphabet is not None:
    if lang.use_ascii:
        lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
    if lang.case_mapping:
-        lang.alphabet = list(set([ l.lower() for l in lang.alphabet ]))
+        lang.alphabet = list(set([ l.lower() if len(unicodedata.normalize('NFC', l.lower())) == 1 else l for l in lang.alphabet ]))
    lang.alphabet = list(set(lang.alphabet))

 # Starting processing.
@ -333,7 +334,10 @@ for charset in charsets:
                CTOM_str += 'NUM,'
            else: # LET
                uchar = bytes([cp]).decode(charset)
-                if lang.case_mapping and uchar.isupper():
+                if lang.case_mapping and uchar.isupper() and \
+                   len(unicodedata.normalize('NFC', uchar.lower())) == 1:
+                   # Unless we encounter special cases of characters with no
+                   # composed lowercase, we lowercase it.
                    uchar = uchar.lower()
                for order, (char, ratio) in enumerate(sorted_ratios):
                    if char == ord(uchar):