script: fix BuildLangModel.py a bit for when use_ascii is True.

In particular, this prepares the case for English detection. I am not
pushing actual English models yet, because it is not efficient enough yet. I
will do so once I am able to handle English confidence better.
This commit is contained in:
Jehan 2021-03-19 18:38:30 +01:00
parent 629bc879f3
commit a98cdcd88f

View File

@ -218,6 +218,11 @@ def process_text(content, lang):
if unicode_value in characters: if unicode_value in characters:
characters[unicode_value] += 1 characters[unicode_value] += 1
is_letter = True is_letter = True
if lang.use_ascii and \
((unicode_value >= 65 and unicode_value <= 90) or \
(unicode_value >= 97 and unicode_value <= 122)):
characters[unicode_value] = 1
is_letter = True
elif lang.unicode_ranges is not None: elif lang.unicode_ranges is not None:
for start, end in lang.unicode_ranges: for start, end in lang.unicode_ranges:
if unicode_value >= start and unicode_value <= end: if unicode_value >= start and unicode_value <= end:
@ -364,7 +369,7 @@ accumulated_ratios = 0
# 64 frequent characters depending on the language. # 64 frequent characters depending on the language.
logfd.write('\nMost Frequent characters:') logfd.write('\nMost Frequent characters:')
if lang.alphabet is None and lang.frequent_ranges is None: if lang.alphabet is None and lang.frequent_ranges is None:
freq_count = 64 freq_count = min(64, len(sorted_ratios))
for order, (char, ratio) in enumerate(sorted_ratios): for order, (char, ratio) in enumerate(sorted_ratios):
if order >= freq_count: if order >= freq_count:
break break