BuildLangModel: add concept of custom_case_mapping…

… for languages for which Python's lower() algorithm fails.
In particular, the Turkish dotted/dotless 'i' does not follow the same
rules as common Western languages.
The lowercase of 'I' is not 'i' but 'ı'.
The uppercase of 'i' is not 'I' but 'İ'.
This commit is contained in:
Jehan 2015-12-04 02:29:40 +01:00
parent f0e122b506
commit 22b9ed2d4f

View File

@ -108,14 +108,42 @@ if hasattr(lang, 'case_mapping'):
lang.case_mapping = bool(lang.case_mapping) lang.case_mapping = bool(lang.case_mapping)
else: else:
lang.case_mapping = False lang.case_mapping = False
if not hasattr(lang, 'custom_case_mapping'):
lang.custom_case_mapping = None
if not hasattr(lang, 'alphabet') or lang.alphabet is None: if not hasattr(lang, 'alphabet') or lang.alphabet is None:
lang.alphabet = None lang.alphabet = None
def local_lowercase(text, lang):
    """Lowercase `text` according to the language's casing rules.

    Python's built-in str.lower() is wrong for some languages (e.g. Turkish,
    where 'I'.lower() must be 'ı', not 'i'), so a language may supply an
    explicit `custom_case_mapping` dict that takes precedence over the
    generic algorithm.

    Parameters:
        text: the string to lowercase.
        lang: a language descriptor object with attributes
              `custom_case_mapping` (dict mapping chars to their lowercase
              form, or None) and `case_mapping` (bool enabling generic
              lowercasing).

    Returns:
        The lowercased string. Characters are left untouched when no rule
        applies, and also when their generic lowercase does not normalize
        (NFC) to a single code point — e.g. 'İ'.lower() is 'i' + combining
        dot, which we must not emit.
    """
    custom = lang.custom_case_mapping

    def _lower_one(ch):
        # Explicit per-language mapping wins, even over isupper()/case_mapping.
        if custom is not None and ch in custom:
            return custom[ch]
        # Generic path: only lowercase real uppercase letters, only when the
        # language opted in, and only when the result stays one code point.
        if ch.isupper() and lang.case_mapping and \
                len(unicodedata.normalize('NFC', ch.lower())) == 1:
            return ch.lower()
        return ch

    # ''.join over a generator avoids quadratic '+=' string building.
    return ''.join(_lower_one(ch) for ch in text)
if lang.alphabet is not None: if lang.alphabet is not None:
if lang.use_ascii: if lang.use_ascii:
lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)] lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
if lang.case_mapping: if lang.case_mapping or lang.custom_case_mapping is not None:
lang.alphabet = list(set([ l.lower() if len(unicodedata.normalize('NFC', l.lower())) == 1 else l for l in lang.alphabet ])) lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
#alphabet = []
#for l in lang.alphabet:
#if l.isupper() and \
#lang.custom_case_mapping is not None and \
#l in lang.custom_case_mapping:
#alphabet.append(lang.custom_case_mapping[l])
#elif l.isupper() and \
#lang.case_mapping and \
#len(unicodedata.normalize('NFC', l.lower())) == 1:
#alphabet.append(l.lower())
#else:
#alphabet.append(l)
lang.alphabet = list(set(lang.alphabet)) lang.alphabet = list(set(lang.alphabet))
# Starting processing. # Starting processing.
@ -133,20 +161,20 @@ characters = {}
sequences = {} sequences = {}
prev_char = None prev_char = None
def process_text(text, clean_text, case_mapping): def process_text(text, lang):
global charsets global charsets
global characters global characters
global sequences global sequences
global prev_char global prev_char
if clean_text is not None: if lang.clean_wikipedia_content is not None:
content = clean_text(text) content = lang.clean_wikipedia_content(text)
# Clean multiple spaces. Newlines and such are normalized to spaces, # Clean multiple spaces. Newlines and such are normalized to spaces,
# since they have basically a similar role in the purpose of uchardet. # since they have basically a similar role in the purpose of uchardet.
content = re.sub(r'\s+', ' ', content) content = re.sub(r'\s+', ' ', content)
if case_mapping: if lang.case_mapping or lang.custom_case_mapping is not None:
content = content.lower() content = local_lowercase(content, lang)
# In python 3, strings are UTF-8. # In python 3, strings are UTF-8.
# Looping through them return expected characters. # Looping through them return expected characters.
@ -206,9 +234,7 @@ def visit_pages(titles, depth, lang, logfd):
continue continue
logfd.write("\n{} (revision {})".format(title, page.revision_id)) logfd.write("\n{} (revision {})".format(title, page.revision_id))
process_text(page.content, process_text(page.content, lang)
lang.clean_wikipedia_content,
lang.case_mapping)
next_titles += page.links next_titles += page.links
if depth >= options.max_depth: if depth >= options.max_depth:
@ -334,11 +360,12 @@ for charset in charsets:
CTOM_str += 'NUM,' CTOM_str += 'NUM,'
else: # LET else: # LET
uchar = bytes([cp]).decode(charset) uchar = bytes([cp]).decode(charset)
if lang.case_mapping and uchar.isupper() and \ #if lang.case_mapping and uchar.isupper() and \
len(unicodedata.normalize('NFC', uchar.lower())) == 1: #len(unicodedata.normalize('NFC', uchar.lower())) == 1:
# Unless we encounter special cases of characters with no # Unless we encounter special cases of characters with no
# composed lowercase, we lowercase it. # composed lowercase, we lowercase it.
uchar = uchar.lower() if lang.case_mapping or lang.custom_case_mapping is not None:
uchar = local_lowercase(uchar, lang)
for order, (char, ratio) in enumerate(sorted_ratios): for order, (char, ratio) in enumerate(sorted_ratios):
if char == ord(uchar): if char == ord(uchar):
CTOM_str += '{:3},'.format(order) CTOM_str += '{:3},'.format(order)