mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
script: fix BuildLangModel.py a bit when use_ascii is True.
In particular, I am preparing the case for English detection. I am not pushing actual English models yet, because it is not efficient enough yet. I will do so once I am able to handle English confidence better.
This commit is contained in:
parent
629bc879f3
commit
a98cdcd88f
@ -218,6 +218,11 @@ def process_text(content, lang):
|
||||
if unicode_value in characters:
|
||||
characters[unicode_value] += 1
|
||||
is_letter = True
|
||||
if lang.use_ascii and \
|
||||
((unicode_value >= 65 and unicode_value <= 90) or \
|
||||
(unicode_value >= 97 and unicode_value <= 122)):
|
||||
characters[unicode_value] = 1
|
||||
is_letter = True
|
||||
elif lang.unicode_ranges is not None:
|
||||
for start, end in lang.unicode_ranges:
|
||||
if unicode_value >= start and unicode_value <= end:
|
||||
@ -364,7 +369,7 @@ accumulated_ratios = 0
|
||||
# 64 frequent characters depending on the language.
|
||||
logfd.write('\nMost Frequent characters:')
|
||||
if lang.alphabet is None and lang.frequent_ranges is None:
|
||||
freq_count = 64
|
||||
freq_count = min(64, len(sorted_ratios))
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if order >= freq_count:
|
||||
break
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user