script: generate more complete frequent characters when range is set.

The early version stopped earlier, on the assumption that frequent
ranges were only used for language scripts with a very large number of
characters (such as Korean, and even more so Japanese or Chinese), so
it was not efficient to keep data for all of them. Since we now use a
separate language detector for CJK, the remaining scripts (so far) have
a usable range of characters. It is therefore much preferred to keep as
much data as possible for these.

This made it possible to redo the Thai model (cf. previous commit) with
more data, and hence to get much better language confidence on Thai
texts.
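
To illustrate the behavioural change, here is a minimal, self-contained
Python sketch (not taken from the patch) of the new fallback: when a
language module declares unicode_ranges but no explicit frequent_ranges,
the whole declared range is now treated as frequent. The Lang class and
the Thai range below are illustrative stand-ins, not the real language
module.

    # Minimal sketch of the new default behaviour (illustrative only).
    # 'Lang' and the Thai block below are stand-ins for a language module.

    class Lang:
        pass

    lang = Lang()
    lang.unicode_ranges = [(0x0E00, 0x0E7F)]  # e.g. the Thai Unicode block
    # No frequent_ranges attribute is set by the language module.

    if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
        if lang.unicode_ranges is not None:
            # New: fall back to the full script range instead of disabling
            # range-based selection entirely.
            lang.frequent_ranges = lang.unicode_ranges
        else:
            lang.frequent_ranges = None

    print(lang.frequent_ranges)  # [(3584, 3711)]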
Author: Jehan
Date:   2021-03-22 17:44:06 +01:00
parent 314f062c70
commit 8e2cf7b81b

@@ -120,7 +120,10 @@ if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
 if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
     lang.unicode_ranges = None
 if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
-    lang.frequent_ranges = None
+    if lang.unicode_ranges is not None:
+        lang.frequent_ranges = lang.unicode_ranges
+    else:
+        lang.frequent_ranges = None
 
 def local_lowercase(text, lang):
     lowercased = ''
@@ -413,34 +416,28 @@ elif lang.alphabet is not None:
               "\n Missing characters: {}".format(", ".join(lang.alphabet)))
         exit(1)
 elif lang.frequent_ranges is not None:
-    # How many characters in the frequent range?
-    frequent_ranges_size = 0
-    for start, end in lang.frequent_ranges:
-        frequent_ranges_size += end - start + 1
-    # Keep ratio for at least all the characters inside the frequent
-    # ranges.
     freq_count = 0
-    non_freq_counter = 0
-    non_freq_ratio = 0
     for order, (char, ratio) in enumerate(sorted_ratios):
         for start, end in lang.frequent_ranges:
             if char >= start and char <= end:
                 freq_count += 1
-                non_freq_counter = 0
-                non_freq_ratio = 0
                 accumulated_ratios += ratio
                 logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
-                frequent_ranges_size -= 1
                 break
         else:
-            if non_freq_counter >= 2:
-                # We don't try to get necessarily the whole range, but break
-                # when we are getting into known non-frequent area.
-                freq_count -= non_freq_counter
-                accumulated_ratios -= non_freq_ratio
-                break
-            freq_count += 1
+            # A frequent character in the non-frequent range.
+            logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+            freq_count += 1
             accumulated_ratios += ratio
-            if frequent_ranges_size <= 0:
-                non_freq_counter += 1
-                non_freq_ratio += ratio
         if accumulated_ratios >= 0.99:
-            if non_freq_counter > 0:
-                freq_count -= non_freq_counter
-                accumulated_ratios -= non_freq_ratio
             break
     logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))