script: move the Wikipedia title syntax cleaning to BuildLangModel.py.

This commit is contained in:
Jehan 2016-02-21 16:20:22 +01:00
parent d24bd7d578
commit 198190461e
12 changed files with 9 additions and 88 deletions

View File

@ -172,6 +172,9 @@ def process_text(text, lang):
if lang.clean_wikipedia_content is not None:
content = lang.clean_wikipedia_content(text)
# Clean out the Wikipedia syntax for titles.
content = re.sub(r'(=+) *([^=]+) *\1',
r'\2', content)
# Collapse runs of whitespace. Newlines and the like are normalized to spaces,
# since they play essentially the same role as far as uchardet is concerned.
content = re.sub(r'\s+', ' ', content)

View File

@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256']
start_pages = ['الصفحة_الرئيسية']
wikipedia_code = code
case_mapping = False
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Return *content* with Wikipedia heading markers removed.

    A heading looks like "== Title ==": a run of '=' signs, the heading
    text, then the same run of '=' signs again. Only the text between the
    two runs is kept (leading spaces are dropped; trailing spaces stay
    because they are consumed as part of the heading text).
    """
    heading = re.compile(r'(=+) *([^=]+) *\1')
    return heading.sub(r'\2', content)

View File

@ -67,12 +67,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Return *content* with the '=' runs around headings removed."""
    # Headings can arrive with extra text fused to the title, e.g.
    # "=== Articles connexesModifier ===". Only the surrounding '=' runs
    # are removed here; the text between them is kept as is.
    title_pattern = r'(=+) *([^=]+) *\1'
    keep_title_text = r'\2'
    return re.sub(title_pattern, keep_title_text, content)

View File

@ -67,12 +67,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Remove Wikipedia title syntax such as "=== Articles connexes ===".

    The opening run of '=' signs must be repeated after the title text
    for the substitution to apply; the title text itself is preserved.
    """
    return re.sub(r'(=+) *([^=]+) *\1', r'\2', content)

View File

@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω'
start_pages = ['Πύλη:Κύρια']
wikipedia_code = code
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Strip the '=' markers that delimit Wikipedia section titles."""
    # Group 1 is the opening run of '=' and must reappear after the
    # title; group 2 is the title text, which replaces the whole match.
    title_re = re.compile(r'(=+) *([^=]+) *\1')
    without_markers = title_re.sub(r'\2', content)
    return without_markers

View File

@ -65,12 +65,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Drop title syntax like "=== Articles connexes ===" from *content*.

    Returns the text with each matched heading replaced by its inner
    title text; everything else is left unchanged.
    """
    pattern, replacement = r'(=+) *([^=]+) *\1', r'\2'
    return re.sub(pattern, replacement, content)

View File

@ -67,11 +67,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Return *content* without the '=' runs surrounding section titles."""
    strip_heading_markers = re.compile(r'(=+) *([^=]+) *\1').sub
    # '\2' keeps only the title text captured between the two '=' runs.
    return strip_heading_markers(r'\2', content)

View File

@ -70,9 +70,10 @@ case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
# Note that we are already cleaning away the '=' signs of Wikipedia's
# title syntax, as well as double spaces. But sometimes, Wikipedia in
# some languages may return weird syntax or UI text which should be
# discarded. If you encounter one of these cases, use this function.
def clean_wikipedia_content(content):
# NOTE(review): as displayed, everything after `return cleaned` is
# unreachable. This hunk appears to show the old body (regex cleanup)
# and the new body (plain `return content`) together; confirm which
# one is intended before relying on this function.
#
# Headings can arrive with extra text fused to the title, e.g.
# "=== Articles connexesModifier ===". The substitution below removes
# only the surrounding '=' runs and keeps the text between them.
cleaned = re.sub(r'(=+) *([^=]+) *\1',
r'\2',
content)
return cleaned
# Do your garbage text cleaning here.
return content

View File

@ -64,11 +64,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Remove Wikipedia heading markup, keeping the heading text.

    A matching pair of '=' runs around a piece of text is deleted; the
    enclosed text replaces the whole match.
    """
    heading_markup = r'(=+) *([^=]+) *\1'
    return re.sub(heading_markup, r'\2', content)

View File

@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620']
start_pages = ['หน้าหลัก']
wikipedia_code = code
case_mapping = False
def clean_wikipedia_content(content):
    """Get rid of title syntax such as "=== Articles connexes ===".

    Returns *content* with every heading reduced to its title text.
    """
    matcher = re.compile(r'(=+) *([^=]+) *\1')
    result = matcher.sub(r'\2', content)
    return result

View File

@ -65,12 +65,3 @@ wikipedia_code = code
# This is wrong when it comes to Turkish.
custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Strip Wikipedia title syntax, e.g. "=== Articles connexes ===".

    The same run of '=' signs must open and close the title; the text in
    between is what remains in the returned string.
    """
    title_syntax = r'(=+) *([^=]+) *\1'
    title_text_only = r'\2'
    return re.sub(title_syntax, title_text_only, content)

View File

@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
start_pages = ['Chữ_Quốc_ngữ']
wikipedia_code = code
case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
def clean_wikipedia_content(content):
    """Return *content* with Wikipedia section-title markers removed."""
    # The backreference \1 requires the closing '=' run to repeat the
    # opening one; only the captured title text (\2) is kept.
    return re.compile(r'(=+) *([^=]+) *\1').sub(r'\2', content)