mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-13 15:10:06 +08:00
BuildLangModel: map different cases of the same character together.
With the new case_mapping lang property, we can consider the upper- and lower-case versions of the same character as one character. This makes sense in some languages, and allows some rarer characters (which are still part of the main alphabet) to enter the frequent character list. For instance 'œ' and 'Œ' in French.
This commit is contained in:
parent
00a78faa1d
commit
7f290975ba
@ -102,6 +102,10 @@ if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
|
||||
lang.wikipedia_code = lang.code
|
||||
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
|
||||
lang.clean_wikipedia_content = None
|
||||
if hasattr(lang, 'case_mapping'):
|
||||
lang.case_mapping = bool(lang.case_mapping)
|
||||
else:
|
||||
lang.case_mapping = False
|
||||
|
||||
# Starting processing.
|
||||
wikipedia.set_lang(lang.wikipedia_code)
|
||||
@ -125,6 +129,7 @@ def visit_page(title, depth, clean_text, logfd):
|
||||
global sequences
|
||||
global prev_char
|
||||
global options
|
||||
global lang
|
||||
|
||||
if options.max_page is not None and \
|
||||
len(visited_pages) > options.max_page:
|
||||
@ -145,6 +150,9 @@ def visit_page(title, depth, clean_text, logfd):
|
||||
# since they have basically a similar role in the purpose of uchardet.
|
||||
content = re.sub(r'\s+', ' ', content)
|
||||
|
||||
if lang.case_mapping:
|
||||
content = content.lower()
|
||||
|
||||
# In Python 3, strings are sequences of Unicode code points.
|
||||
# Looping through them returns the expected characters.
|
||||
for char in content:
|
||||
@ -282,6 +290,8 @@ for charset in charsets:
|
||||
CTOM_str += 'NUM,'
|
||||
else: # LET
|
||||
uchar = bytes([cp]).decode(charset)
|
||||
if lang.case_mapping and uchar.isupper():
|
||||
uchar = uchar.lower()
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if char == ord(uchar):
|
||||
CTOM_str += '{:3},'.format(order)
|
||||
|
||||
@ -58,6 +58,10 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
|
||||
start_page = 'Wikipédia:Accueil_principal'
|
||||
# Give the possibility to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python's own algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user