mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-07 01:06:40 +08:00
script: fix BuildLangModel.py a bit when use_ascii is True.
In particular, this prepares the ground for English detection. I am not pushing actual English models yet, because it is not efficient enough yet. I will do so once I am able to handle English confidence better.
This commit is contained in:
parent
629bc879f3
commit
a98cdcd88f
@ -218,6 +218,11 @@ def process_text(content, lang):
|
|||||||
if unicode_value in characters:
|
if unicode_value in characters:
|
||||||
characters[unicode_value] += 1
|
characters[unicode_value] += 1
|
||||||
is_letter = True
|
is_letter = True
|
||||||
|
if lang.use_ascii and \
|
||||||
|
((unicode_value >= 65 and unicode_value <= 90) or \
|
||||||
|
(unicode_value >= 97 and unicode_value <= 122)):
|
||||||
|
characters[unicode_value] = 1
|
||||||
|
is_letter = True
|
||||||
elif lang.unicode_ranges is not None:
|
elif lang.unicode_ranges is not None:
|
||||||
for start, end in lang.unicode_ranges:
|
for start, end in lang.unicode_ranges:
|
||||||
if unicode_value >= start and unicode_value <= end:
|
if unicode_value >= start and unicode_value <= end:
|
||||||
@ -364,7 +369,7 @@ accumulated_ratios = 0
|
|||||||
# 64 frequent characters depending on the language.
|
# 64 frequent characters depending on the language.
|
||||||
logfd.write('\nMost Frequent characters:')
|
logfd.write('\nMost Frequent characters:')
|
||||||
if lang.alphabet is None and lang.frequent_ranges is None:
|
if lang.alphabet is None and lang.frequent_ranges is None:
|
||||||
freq_count = 64
|
freq_count = min(64, len(sorted_ratios))
|
||||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||||
if order >= freq_count:
|
if order >= freq_count:
|
||||||
break
|
break
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user