From 7f290975ba77b62969a2cb65be57e7882639f4c9 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Sun, 29 Nov 2015 02:14:48 +0100
Subject: [PATCH] BuildLangModel: map different cases of the same character
 together.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the new case_mapping lang property, we can consider the upper and
lower case versions of the same character as one character.
This makes sense in some languages, and allows some rarer characters
(which are still part of the main alphabet) to make it into the frequent
character list. For instance 'œ' and 'Œ' in French.
---
 script/BuildLangModel.py | 10 ++++++++++
 script/langs/fr.py       |  4 ++++
 2 files changed, 14 insertions(+)

diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 5fb3a9e..1596f5c 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -102,6 +102,10 @@ if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
     lang.wikipedia_code = lang.code
 if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
     lang.clean_wikipedia_content = None
+if hasattr(lang, 'case_mapping'):
+    lang.case_mapping = bool(lang.case_mapping)
+else:
+    lang.case_mapping = False
 
 # Starting processing.
 wikipedia.set_lang(lang.wikipedia_code)
@@ -125,6 +129,7 @@ def visit_page(title, depth, clean_text, logfd):
     global sequences
     global prev_char
     global options
+    global lang
 
     if options.max_page is not None and \
        len(visited_pages) > options.max_page:
@@ -145,6 +150,9 @@ def visit_page(title, depth, clean_text, logfd):
     # since they have basically a similar role in the purpose of uchardet.
     content = re.sub(r'\s+', ' ', content)
 
+    if lang.case_mapping:
+        content = content.lower()
+
     # In python 3, strings are UTF-8.
     # Looping through them return expected characters.
     for char in content:
@@ -282,6 +290,8 @@ for charset in charsets:
                 CTOM_str += 'NUM,'
             else: # LET
                 uchar = bytes([cp]).decode(charset)
+                if lang.case_mapping and uchar.isupper():
+                    uchar = uchar.lower()
                 for order, (char, ratio) in enumerate(sorted_ratios):
                     if char == ord(uchar):
                         CTOM_str += '{:3},'.format(order)
diff --git a/script/langs/fr.py b/script/langs/fr.py
index f2951a2..602d96c 100644
--- a/script/langs/fr.py
+++ b/script/langs/fr.py
@@ -58,6 +58,10 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
 start_page = 'Wikipédia:Accueil_principal'
 # give possibility to select another code for the Wikipedia URL.
 wikipedia_code = code
+# 'a' and 'A' will be considered the same character, and so on.
+# This relies on Python's str.lower() / str.isupper() to determine the
+# upper/lower case of a given character.
+case_mapping = True
 
 # A function to clean content returned by the `wikipedia` python lib,
 # in case some unwanted data has been overlooked.