From 198190461e35a7a03ea3364cded69dccf67f8250 Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 21 Feb 2016 16:20:22 +0100 Subject: [PATCH] script: move the Wikipedia title syntax cleaning to BuildLangModel.py. --- script/BuildLangModel.py | 3 +++ script/langs/ar.py | 8 -------- script/langs/da.py | 9 --------- script/langs/de.py | 9 --------- script/langs/el.py | 8 -------- script/langs/eo.py | 9 --------- script/langs/es.py | 8 -------- script/langs/fr.py | 11 ++++++----- script/langs/hu.py | 8 -------- script/langs/th.py | 7 ------- script/langs/tr.py | 9 --------- script/langs/vi.py | 8 -------- 12 files changed, 9 insertions(+), 88 deletions(-) diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py index 8ed52cf..a412f13 100755 --- a/script/BuildLangModel.py +++ b/script/BuildLangModel.py @@ -172,6 +172,9 @@ def process_text(text, lang): if lang.clean_wikipedia_content is not None: content = lang.clean_wikipedia_content(text) + # Clean out the Wikipedia syntax for titles. + content = re.sub(r'(=+) *([^=]+) *\1', + r'\2', content) # Clean multiple spaces. Newlines and such are normalized to spaces, # since they have basically a similar role in the purpose of uchardet. content = re.sub(r'\s+', ' ', content) diff --git a/script/langs/ar.py b/script/langs/ar.py index 05952b8..2506e7b 100644 --- a/script/langs/ar.py +++ b/script/langs/ar.py @@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256'] start_pages = ['الصفحة_الرئيسية'] wikipedia_code = code case_mapping = False - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/da.py b/script/langs/da.py index df94208..18d2379 100644 --- a/script/langs/da.py +++ b/script/langs/da.py @@ -67,12 +67,3 @@ wikipedia_code = code # This uses Python algorithm to determine upper/lower-case of a given # character. case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - # We get modify link in the text: "=== Articles connexesModifier ===" - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/de.py b/script/langs/de.py index 554f142..e004901 100644 --- a/script/langs/de.py +++ b/script/langs/de.py @@ -67,12 +67,3 @@ wikipedia_code = code # This uses Python algorithm to determine upper/lower-case of a given # character. case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - # Get rid of title syntax: "=== Articles connexes ===" - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/el.py b/script/langs/el.py index efd9a3e..2726229 100644 --- a/script/langs/el.py +++ b/script/langs/el.py @@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω' start_pages = ['Πύλη:Κύρια'] wikipedia_code = code case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/eo.py b/script/langs/eo.py index c593921..e9430cc 100644 --- a/script/langs/eo.py +++ b/script/langs/eo.py @@ -65,12 +65,3 @@ wikipedia_code = code # This uses Python algorithm to determine upper/lower-case of a given # character. case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - # Get rid of title syntax: "=== Articles connexes ===" - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/es.py b/script/langs/es.py index f48acc5..5219296 100644 --- a/script/langs/es.py +++ b/script/langs/es.py @@ -67,11 +67,3 @@ wikipedia_code = code # This uses Python algorithm to determine upper/lower-case of a given # character. case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/fr.py b/script/langs/fr.py index 9312b7b..fff730b 100644 --- a/script/langs/fr.py +++ b/script/langs/fr.py @@ -70,9 +70,10 @@ case_mapping = True # A function to clean content returned by the `wikipedia` python lib, # in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. def clean_wikipedia_content(content): - # We get modify link in the text: "=== Articles connexesModifier ===" - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned + # Do your garbage text cleaning here. + return content diff --git a/script/langs/hu.py b/script/langs/hu.py index 8ff01cb..e6ee345 100644 --- a/script/langs/hu.py +++ b/script/langs/hu.py @@ -64,11 +64,3 @@ wikipedia_code = code # This uses Python algorithm to determine upper/lower-case of a given # character. case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/th.py b/script/langs/th.py index 3ddeee1..eb3fdaa 100644 --- a/script/langs/th.py +++ b/script/langs/th.py @@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620'] start_pages = ['หน้าหลัก'] wikipedia_code = code case_mapping = False - -def clean_wikipedia_content(content): - # Get rid of title syntax: "=== Articles connexes ===" - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/tr.py b/script/langs/tr.py index 521c7da..d8b5ac1 100644 --- a/script/langs/tr.py +++ b/script/langs/tr.py @@ -65,12 +65,3 @@ wikipedia_code = code # This is wrong when it comes to Turkish. custom_case_mapping = { 'İ': 'i', 'I': 'ı' } case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - # Get rid of title syntax: "=== Articles connexes ===" - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned diff --git a/script/langs/vi.py b/script/langs/vi.py index 3a38cc4..f44aeb6 100644 --- a/script/langs/vi.py +++ b/script/langs/vi.py @@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy' start_pages = ['Chữ_Quốc_ngữ'] wikipedia_code = code case_mapping = True - -# A function to clean content returned by the `wikipedia` python lib, -# in case some unwanted data has been overlooked. -def clean_wikipedia_content(content): - cleaned = re.sub(r'(=+) *([^=]+) *\1', - r'\2', - content) - return cleaned