script: move the Wikipedia title syntax cleaning to BuildLangModel.py.

2026-01-01 03:12:24 +08:00 · 2016-02-21 16:20:22 +01:00 · 2016-02-21 16:20:22 +01:00 · 198190461e
commit 198190461e
parent d24bd7d578
12 changed files with 9 additions and 88 deletions
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -172,6 +172,9 @@ def process_text(text, lang):

    if lang.clean_wikipedia_content is not None:
        content = lang.clean_wikipedia_content(text)
+    # Clean out the Wikipedia syntax for titles.
+    content = re.sub(r'(=+) *([^=]+) *\1',
+                     r'\2', content)
    # Clean multiple spaces. Newlines and such are normalized to spaces,
    # since they have basically a similar role in the purpose of uchardet.
    content = re.sub(r'\s+', ' ', content)
--- a/script/langs/ar.py
+++ b/script/langs/ar.py
@@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256']
 start_pages = ['الصفحة_الرئيسية']
 wikipedia_code = code
 case_mapping = False
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/da.py
+++ b/script/langs/da.py
@@ -67,12 +67,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # We get modify link in the text: "=== Articles connexesModifier ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/de.py
+++ b/script/langs/de.py
@@ -67,12 +67,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/el.py
+++ b/script/langs/el.py
@@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω'
 start_pages = ['Πύλη:Κύρια']
 wikipedia_code = code
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/eo.py
+++ b/script/langs/eo.py
@@ -65,12 +65,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/es.py
+++ b/script/langs/es.py
@@ -67,11 +67,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/fr.py
+++ b/script/langs/fr.py
@@ -70,9 +70,10 @@ case_mapping = True

 # A function to clean content returned by the `wikipedia` python lib,
 # in case some unwanted data has been overlooked.
+# Note that we are already cleaning away the '=' from the title syntax
+# of Wikipedia, as well as repeated whitespace. But sometimes, Wikipedia
+# in some languages may return weird syntax or UI text which should be
+# discarded. If you encounter one of these cases, use this function.
 def clean_wikipedia_content(content):
-    # We get modify link in the text: "=== Articles connexesModifier ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
+    # Do your garbage text cleaning here.
+    return content
--- a/script/langs/hu.py
+++ b/script/langs/hu.py
@@ -64,11 +64,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/th.py
+++ b/script/langs/th.py
@@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620']
 start_pages = ['หน้าหลัก']
 wikipedia_code = code
 case_mapping = False
-
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/tr.py
+++ b/script/langs/tr.py
@@ -65,12 +65,3 @@ wikipedia_code = code
 # This is wrong when it comes to Turkish.
 custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
--- a/script/langs/vi.py
+++ b/script/langs/vi.py
@@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
 start_pages = ['Chữ_Quốc_ngữ']
 wikipedia_code = code
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned