script: fix BuildLangModel.py a bit when use_ascii is True.

In particular, I prepare the case for English detection. I am not pushing actual English models yet, because it is not efficient enough yet. I will do so once I am able to handle English confidence better.
2025-12-06 16:56:40 +08:00 · 2021-03-19 18:38:30 +01:00 · 2021-03-19 18:38:30 +01:00 · a98cdcd88f
commit a98cdcd88f
parent 629bc879f3
1 changed files with 8 additions and 3 deletions
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@ -218,6 +218,11 @@ def process_text(content, lang):
        if unicode_value in characters:
            characters[unicode_value] += 1
            is_letter = True
+        if lang.use_ascii and \
+           ((unicode_value >= 65 and unicode_value <= 90) or \
+            (unicode_value >= 97 and unicode_value <= 122)):
+          characters[unicode_value] = 1
+          is_letter = True
        elif lang.unicode_ranges is not None:
            for start, end in lang.unicode_ranges:
              if unicode_value >= start and unicode_value <= end:
@ -364,7 +369,7 @@ accumulated_ratios = 0
 # 64 frequent characters depending on the language.
 logfd.write('\nMost Frequent characters:')
 if lang.alphabet is None and lang.frequent_ranges is None:
-    freq_count = 64
+    freq_count = min(64, len(sorted_ratios))
    for order, (char, ratio) in enumerate(sorted_ratios):
        if order >= freq_count:
            break