From 7f290975ba77b62969a2cb65be57e7882639f4c9 Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 29 Nov 2015 02:14:48 +0100 Subject: [PATCH] BuildLangModel: map different cases of the same character together. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the new case_mapping lang property, we can consider upper and lower case versions of the same character as one character. This makes sense in some languages, and allows some rarer characters (which are still part of the main alphabet) to make it into the frequent character list. For instance 'œ' and 'Œ' in French. --- script/BuildLangModel.py | 10 ++++++++++ script/langs/fr.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py index 5fb3a9e..1596f5c 100755 --- a/script/BuildLangModel.py +++ b/script/BuildLangModel.py @@ -102,6 +102,10 @@ if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None: lang.wikipedia_code = lang.code if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None: lang.clean_wikipedia_content = None +if hasattr(lang, 'case_mapping'): + lang.case_mapping = bool(lang.case_mapping) +else: + lang.case_mapping = False # Starting processing. wikipedia.set_lang(lang.wikipedia_code) @@ -125,6 +129,7 @@ def visit_page(title, depth, clean_text, logfd): global sequences global prev_char global options + global lang if options.max_page is not None and \ len(visited_pages) > options.max_page: @@ -145,6 +150,9 @@ def visit_page(title, depth, clean_text, logfd): # since they have basically a similar role in the purpose of uchardet. content = re.sub(r'\s+', ' ', content) + if lang.case_mapping: + content = content.lower() + # In python 3, strings are UTF-8. # Looping through them return expected characters. 
for char in content: @@ -282,6 +290,8 @@ for charset in charsets: CTOM_str += 'NUM,' else: # LET uchar = bytes([cp]).decode(charset) + if lang.case_mapping and uchar.isupper(): + uchar = uchar.lower() for order, (char, ratio) in enumerate(sorted_ratios): if char == ord(uchar): CTOM_str += '{:3},'.format(order) diff --git a/script/langs/fr.py b/script/langs/fr.py index f2951a2..602d96c 100644 --- a/script/langs/fr.py +++ b/script/langs/fr.py @@ -58,6 +58,10 @@ charsets = ['ISO-8859-15', 'ISO-8859-1'] start_page = 'Wikipédia:Accueil_principal' # give possibility to select another code for the Wikipedia URL. wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True # A function to clean content returned by the `wikipedia` python lib, # in case some unwanted data has been overlooked.