mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-09 18:36:41 +08:00
BuildLangModel: lowercase only when resulting char has a composed form.
I ran into this with the Turkish dotted capital 'İ' (U+0130): lowercasing it with Python's str.lower() returns a decomposed sequence ('i' followed by U+0307 COMBINING DOT ABOVE) that NFC normalization cannot recompose into a single character. As a result, ord() raised a TypeError because the string length was 2.
This commit is contained in:
parent
b56a3c7b84
commit
a167bd5e42
@ -39,6 +39,7 @@
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
# Third party modules.
|
||||
import unicodedata
|
||||
import wikipedia
|
||||
import importlib
|
||||
import optparse
|
||||
@ -114,7 +115,7 @@ if lang.alphabet is not None:
|
||||
if lang.use_ascii:
|
||||
lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
|
||||
if lang.case_mapping:
|
||||
lang.alphabet = list(set([ l.lower() for l in lang.alphabet ]))
|
||||
lang.alphabet = list(set([ l.lower() if len(unicodedata.normalize('NFC', l.lower())) == 1 else l for l in lang.alphabet ]))
|
||||
lang.alphabet = list(set(lang.alphabet))
|
||||
|
||||
# Starting processing.
|
||||
@ -333,7 +334,10 @@ for charset in charsets:
|
||||
CTOM_str += 'NUM,'
|
||||
else: # LET
|
||||
uchar = bytes([cp]).decode(charset)
|
||||
if lang.case_mapping and uchar.isupper():
|
||||
if lang.case_mapping and uchar.isupper() and \
|
||||
len(unicodedata.normalize('NFC', uchar.lower())) == 1:
|
||||
# Unless we encounter special cases of characters with no
|
||||
# composed lowercase, we lowercase it.
|
||||
uchar = uchar.lower()
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if char == ord(uchar):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user