From a98cdcd88f87a602b7b2569ced11de79b1283d6a Mon Sep 17 00:00:00 2001 From: Jehan Date: Fri, 19 Mar 2021 18:38:30 +0100 Subject: [PATCH] script: fix BuildLangModel.py a bit when use_ascii is True. In particular, I am preparing for the case of English detection. I am not pushing actual English models yet, because it is not very efficient yet. I will do so once I am able to handle English confidence better. --- script/BuildLangModel.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py index c76e46f..d0d52dc 100755 --- a/script/BuildLangModel.py +++ b/script/BuildLangModel.py @@ -170,8 +170,8 @@ def normalize_codepoint_ranges(input_range): sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end)) else: output_range += [(start, end)] - if len(output_range) == 0: - output_range = None + if len(output_range) == 0: + output_range = None return output_range lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges) @@ -218,6 +218,11 @@ def process_text(content, lang): if unicode_value in characters: characters[unicode_value] += 1 is_letter = True + if lang.use_ascii and \ + ((unicode_value >= 65 and unicode_value <= 90) or \ + (unicode_value >= 97 and unicode_value <= 122)): + characters[unicode_value] = 1 + is_letter = True elif lang.unicode_ranges is not None: for start, end in lang.unicode_ranges: if unicode_value >= start and unicode_value <= end: @@ -364,7 +369,7 @@ accumulated_ratios = 0 # 64 frequent characters depending on the language. logfd.write('\nMost Frequent characters:') if lang.alphabet is None and lang.frequent_ranges is None: - freq_count = 64 + freq_count = min(64, len(sorted_ratios)) for order, (char, ratio) in enumerate(sorted_ratios): if order >= freq_count: break