mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 08:46:40 +08:00
script: move the Wikipedia title syntax cleaning to BuildLangModel.py.
This commit is contained in:
parent
d24bd7d578
commit
198190461e
@ -172,6 +172,9 @@ def process_text(text, lang):
|
||||
|
||||
if lang.clean_wikipedia_content is not None:
|
||||
content = lang.clean_wikipedia_content(text)
|
||||
# Clean out the Wikipedia syntax for titles.
|
||||
content = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2', content)
|
||||
# Clean multiple spaces. Newlines and such are normalized to spaces,
|
||||
# since they have basically a similar role in the purpose of uchardet.
|
||||
content = re.sub(r'\s+', ' ', content)
|
||||
|
||||
@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256']
|
||||
start_pages = ['الصفحة_الرئيسية']
|
||||
wikipedia_code = code
|
||||
case_mapping = False
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -67,12 +67,3 @@ wikipedia_code = code
|
||||
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
# The "modify" link text gets appended to titles: "=== Articles connexesModifier ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -67,12 +67,3 @@ wikipedia_code = code
|
||||
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
# Get rid of title syntax: "=== Articles connexes ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω'
|
||||
start_pages = ['Πύλη:Κύρια']
|
||||
wikipedia_code = code
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -65,12 +65,3 @@ wikipedia_code = code
|
||||
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
# Get rid of title syntax: "=== Articles connexes ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -67,11 +67,3 @@ wikipedia_code = code
|
||||
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -70,9 +70,10 @@ case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
# Note that we are already cleaning away the '=' from the title syntax
|
||||
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||
# some languages may return odd syntax or UI text which should be
|
||||
# discarded. If you encounter one of these cases, use this function.
|
||||
def clean_wikipedia_content(content):
|
||||
# The "modify" link text gets appended to titles: "=== Articles connexesModifier ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
# Do your garbage text cleaning here.
|
||||
return content
|
||||
|
||||
@ -64,11 +64,3 @@ wikipedia_code = code
|
||||
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620']
|
||||
start_pages = ['หน้าหลัก']
|
||||
wikipedia_code = code
|
||||
case_mapping = False
|
||||
|
||||
def clean_wikipedia_content(content):
|
||||
# Get rid of title syntax: "=== Articles connexes ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -65,12 +65,3 @@ wikipedia_code = code
|
||||
# This is wrong when it comes to Turkish.
|
||||
custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
# Get rid of title syntax: "=== Articles connexes ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
||||
start_pages = ['Chữ_Quốc_ngữ']
|
||||
wikipedia_code = code
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user