mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
script: move the Wikipedia title syntax cleaning to BuildLangModel.py.
This commit is contained in:
parent
d24bd7d578
commit
198190461e
@ -172,6 +172,9 @@ def process_text(text, lang):
|
|||||||
|
|
||||||
if lang.clean_wikipedia_content is not None:
|
if lang.clean_wikipedia_content is not None:
|
||||||
content = lang.clean_wikipedia_content(text)
|
content = lang.clean_wikipedia_content(text)
|
||||||
|
# Clean out the Wikipedia syntax for titles.
|
||||||
|
content = re.sub(r'(=+) *([^=]+) *\1',
|
||||||
|
r'\2', content)
|
||||||
# Clean multiple spaces. Newlines and such are normalized to spaces,
|
# Clean multiple spaces. Newlines and such are normalized to spaces,
|
||||||
# since they have basically a similar role in the purpose of uchardet.
|
# since they have basically a similar role in the purpose of uchardet.
|
||||||
content = re.sub(r'\s+', ' ', content)
|
content = re.sub(r'\s+', ' ', content)
|
||||||
|
|||||||
@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256']
|
|||||||
start_pages = ['الصفحة_الرئيسية']
|
start_pages = ['الصفحة_الرئيسية']
|
||||||
wikipedia_code = code
|
wikipedia_code = code
|
||||||
case_mapping = False
|
case_mapping = False
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -67,12 +67,3 @@ wikipedia_code = code
|
|||||||
# This uses Python algorithm to determine upper/lower-case of a given
|
# This uses Python algorithm to determine upper/lower-case of a given
|
||||||
# character.
|
# character.
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
# We get modify link in the text: "=== Articles connexesModifier ==="
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -67,12 +67,3 @@ wikipedia_code = code
|
|||||||
# This uses Python algorithm to determine upper/lower-case of a given
|
# This uses Python algorithm to determine upper/lower-case of a given
|
||||||
# character.
|
# character.
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
# Get rid of title syntax: "=== Articles connexes ==="
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω'
|
|||||||
start_pages = ['Πύλη:Κύρια']
|
start_pages = ['Πύλη:Κύρια']
|
||||||
wikipedia_code = code
|
wikipedia_code = code
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -65,12 +65,3 @@ wikipedia_code = code
|
|||||||
# This uses Python algorithm to determine upper/lower-case of a given
|
# This uses Python algorithm to determine upper/lower-case of a given
|
||||||
# character.
|
# character.
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
# Get rid of title syntax: "=== Articles connexes ==="
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -67,11 +67,3 @@ wikipedia_code = code
|
|||||||
# This uses Python algorithm to determine upper/lower-case of a given
|
# This uses Python algorithm to determine upper/lower-case of a given
|
||||||
# character.
|
# character.
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -70,9 +70,10 @@ case_mapping = True
|
|||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
# A function to clean content returned by the `wikipedia` python lib,
|
||||||
# in case some unwanted data has been overlooked.
|
# in case some unwanted data has been overlooked.
|
||||||
|
# Note that we are already cleaning away the '=' from the title syntax
|
||||||
|
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||||
|
# some languages may return weird syntax or UI text which should be
|
||||||
|
# discarded. If you encounter one of these cases, use this function.
|
||||||
def clean_wikipedia_content(content):
|
def clean_wikipedia_content(content):
|
||||||
# We get modify link in the text: "=== Articles connexesModifier ==="
|
# Do your garbage text cleaning here.
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
return content
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -64,11 +64,3 @@ wikipedia_code = code
|
|||||||
# This uses Python algorithm to determine upper/lower-case of a given
|
# This uses Python algorithm to determine upper/lower-case of a given
|
||||||
# character.
|
# character.
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620']
|
|||||||
start_pages = ['หน้าหลัก']
|
start_pages = ['หน้าหลัก']
|
||||||
wikipedia_code = code
|
wikipedia_code = code
|
||||||
case_mapping = False
|
case_mapping = False
|
||||||
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
# Get rid of title syntax: "=== Articles connexes ==="
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -65,12 +65,3 @@ wikipedia_code = code
|
|||||||
# This is wrong when it comes to Turkish.
|
# This is wrong when it comes to Turkish.
|
||||||
custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
|
custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
# Get rid of title syntax: "=== Articles connexes ==="
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
|||||||
start_pages = ['Chữ_Quốc_ngữ']
|
start_pages = ['Chữ_Quốc_ngữ']
|
||||||
wikipedia_code = code
|
wikipedia_code = code
|
||||||
case_mapping = True
|
case_mapping = True
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
|
||||||
# in case some unwanted data has been overlooked.
|
|
||||||
def clean_wikipedia_content(content):
|
|
||||||
cleaned = re.sub(r'(=+) *([^=]+) *\1',
|
|
||||||
r'\2',
|
|
||||||
content)
|
|
||||||
return cleaned
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user