BuildLangModel: map different cases of the same character together.

With the new case_mapping lang property, we can treat the upper-case and lower-case versions of the same character as a single character. This makes sense in some languages, and allows some rarer characters (which are still part of the main alphabet) to make it into the frequent character list. For instance 'œ' and 'Œ' in French.
2026-02-06 09:49:59 +08:00 · 2015-11-29 02:14:48 +01:00 · 2015-11-29 02:14:48 +01:00 · 7f290975ba
commit 7f290975ba
parent 00a78faa1d
2 changed files with 14 additions and 0 deletions
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@ -102,6 +102,10 @@ if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
    lang.wikipedia_code = lang.code
 if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
    lang.clean_wikipedia_content = None
+if hasattr(lang, 'case_mapping'):
+    lang.case_mapping = bool(lang.case_mapping)
+else:
+    lang.case_mapping = False

 # Starting processing.
 wikipedia.set_lang(lang.wikipedia_code)
@ -125,6 +129,7 @@ def visit_page(title, depth, clean_text, logfd):
    global sequences
    global prev_char
    global options
+    global lang

    if options.max_page is not None and \
       len(visited_pages) > options.max_page:
@ -145,6 +150,9 @@ def visit_page(title, depth, clean_text, logfd):
    # since they have basically a similar role in the purpose of uchardet.
    content = re.sub(r'\s+', ' ', content)

+    if lang.case_mapping:
+        content = content.lower()
+
    # In python 3, strings are UTF-8.
    # Looping through them return expected characters.
    for char in content:
@ -282,6 +290,8 @@ for charset in charsets:
                CTOM_str += 'NUM,'
            else: # LET
                uchar = bytes([cp]).decode(charset)
+                if lang.case_mapping and uchar.isupper():
+                    uchar = uchar.lower()
                for order, (char, ratio) in enumerate(sorted_ratios):
                    if char == ord(uchar):
                        CTOM_str += '{:3},'.format(order)
--- a/script/langs/fr.py
+++ b/script/langs/fr.py
@ -58,6 +58,10 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
 start_page = 'Wikipédia:Accueil_principal'
 # give possibility to select another code for the Wikipedia URL.
 wikipedia_code = code
+# 'a' and 'A' will be considered the same character, and so on.
+# This relies on Python's own case rules to determine the upper/lower case
+# of a given character.
+case_mapping = True

 # A function to clean content returned by the `wikipedia` python lib,
 # in case some unwanted data has been overlooked.