BuildLangModel: map different cases of the same character together.

With the new case_mapping lang property, we can consider upper and lower
case versions of the same character as one character.
This makes sense in some languages, and allows some rarer characters
(which are still part of the main alphabet) to enter the frequent
character list. For instance 'œ' and 'Œ' in French.
This commit is contained in:
Jehan 2015-11-29 02:14:48 +01:00
parent 00a78faa1d
commit 7f290975ba
2 changed files with 14 additions and 0 deletions

View File

@ -102,6 +102,10 @@ if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
lang.wikipedia_code = lang.code
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
lang.clean_wikipedia_content = None
# Normalize the optional `case_mapping` property to a strict boolean;
# a language definition that does not set it gets False.
lang.case_mapping = bool(getattr(lang, 'case_mapping', False))
# Starting processing.
wikipedia.set_lang(lang.wikipedia_code)
@ -125,6 +129,7 @@ def visit_page(title, depth, clean_text, logfd):
global sequences
global prev_char
global options
global lang
if options.max_page is not None and \
len(visited_pages) > options.max_page:
@ -145,6 +150,9 @@ def visit_page(title, depth, clean_text, logfd):
# since they have basically a similar role in the purpose of uchardet.
content = re.sub(r'\s+', ' ', content)
if lang.case_mapping:
content = content.lower()
# In Python 3, strings are sequences of Unicode code points.
# Looping through them returns the expected characters.
for char in content:
@ -282,6 +290,8 @@ for charset in charsets:
CTOM_str += 'NUM,'
else: # LET
uchar = bytes([cp]).decode(charset)
if lang.case_mapping and uchar.isupper():
uchar = uchar.lower()
for order, (char, ratio) in enumerate(sorted_ratios):
if char == ord(uchar):
CTOM_str += '{:3},'.format(order)

View File

@ -58,6 +58,10 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
start_page = 'Wikipédia:Accueil_principal'
# Gives the possibility to select another code for the Wikipedia URL.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This relies on Python's built-in case handling (str.isupper/str.lower)
# to determine the upper/lower-case form of a given character.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.