mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
script: fix BuildLangModel.py a bit when use_ascii is True.
In particular, this prepares the case for English detection. I am not pushing actual English models yet, because it is not efficient enough yet. I will do so once I am able to handle English confidence better.
This commit is contained in:
parent
629bc879f3
commit
a98cdcd88f
@ -170,8 +170,8 @@ def normalize_codepoint_ranges(input_range):
|
|||||||
sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
|
sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
|
||||||
else:
|
else:
|
||||||
output_range += [(start, end)]
|
output_range += [(start, end)]
|
||||||
if len(output_range) == 0:
|
if len(output_range) == 0:
|
||||||
output_range = None
|
output_range = None
|
||||||
return output_range
|
return output_range
|
||||||
|
|
||||||
lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
|
lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
|
||||||
@ -218,6 +218,11 @@ def process_text(content, lang):
|
|||||||
if unicode_value in characters:
|
if unicode_value in characters:
|
||||||
characters[unicode_value] += 1
|
characters[unicode_value] += 1
|
||||||
is_letter = True
|
is_letter = True
|
||||||
|
if lang.use_ascii and \
|
||||||
|
((unicode_value >= 65 and unicode_value <= 90) or \
|
||||||
|
(unicode_value >= 97 and unicode_value <= 122)):
|
||||||
|
characters[unicode_value] = 1
|
||||||
|
is_letter = True
|
||||||
elif lang.unicode_ranges is not None:
|
elif lang.unicode_ranges is not None:
|
||||||
for start, end in lang.unicode_ranges:
|
for start, end in lang.unicode_ranges:
|
||||||
if unicode_value >= start and unicode_value <= end:
|
if unicode_value >= start and unicode_value <= end:
|
||||||
@ -364,7 +369,7 @@ accumulated_ratios = 0
|
|||||||
# 64 frequent characters depending on the language.
|
# 64 frequent characters depending on the language.
|
||||||
logfd.write('\nMost Frequent characters:')
|
logfd.write('\nMost Frequent characters:')
|
||||||
if lang.alphabet is None and lang.frequent_ranges is None:
|
if lang.alphabet is None and lang.frequent_ranges is None:
|
||||||
freq_count = 64
|
freq_count = min(64, len(sorted_ratios))
|
||||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||||
if order >= freq_count:
|
if order >= freq_count:
|
||||||
break
|
break
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user