From 4a579fae0250a3850a5d89b4e8ce513fc0ca145c Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Mon, 22 Mar 2021 17:44:06 +0100
Subject: [PATCH] script: generate more complete frequent characters when range
 is set.

The early version used to stop earlier, assuming frequent ranges were
used only for language scripts with a lot of characters (such as Korean
or, even more so, Japanese or Chinese), hence it was not efficient to
keep data for them all. Since we now use a separate language detector
for CJK, the remaining scripts (so far) have a usable range of
characters. Therefore it is much preferred to keep as much data as
possible on these.

This allowed redoing the Thai model (cf. previous commit) with more
data, hence getting much better language confidence on Thai texts.
---
 script/BuildLangModel.py | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index acceb57..2e5e4ed 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -119,7 +119,10 @@ if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
 if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
     lang.unicode_ranges = None
 if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
-    lang.frequent_ranges = None
+    if lang.unicode_ranges is not None:
+      lang.frequent_ranges = lang.unicode_ranges
+    else:
+      lang.frequent_ranges = None
 
 def local_lowercase(text, lang):
     lowercased = ''
@@ -394,34 +397,28 @@ elif lang.alphabet is not None:
                   "\n       Missing characters: {}".format(", ".join(lang.alphabet)))
             exit(1)
 elif lang.frequent_ranges is not None:
+    # How many characters in the frequent range?
+    frequent_ranges_size = 0
+    for start, end in lang.frequent_ranges:
+      frequent_ranges_size += end - start + 1
+
+    # Keep ratio for at least all the characters inside the frequent
+    # ranges.
     freq_count = 0
-    non_freq_counter = 0
-    non_freq_ratio   = 0
     for order, (char, ratio) in enumerate(sorted_ratios):
       for start, end in lang.frequent_ranges:
         if char >= start and char <= end:
           freq_count += 1
-          non_freq_counter = 0
-          non_freq_ratio   = 0
           accumulated_ratios += ratio
           logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+          frequent_ranges_size -= 1
           break
       else:
-        if non_freq_counter >= 2:
-          # We don't try to get necessarily the whole range, but break
-          # when we are getting into known non-frequent area.
-          freq_count         -= non_freq_counter
-          accumulated_ratios -= non_freq_ratio
-          break
-        freq_count         += 1
+        # A frequent character in the non-frequent range.
+        logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+        freq_count += 1
         accumulated_ratios += ratio
-
-        non_freq_counter   += 1
-        non_freq_ratio     += ratio
-      if accumulated_ratios >= 0.99:
-        if non_freq_counter > 0:
-          freq_count         -= non_freq_counter
-          accumulated_ratios -= non_freq_ratio
+      if frequent_ranges_size <= 0:
         break
 
 logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))