From 600cf76a768df4681863096de47a150215caf0bb Mon Sep 17 00:00:00 2001 From: Jehan Date: Sat, 13 Feb 2016 03:47:41 +0100 Subject: [PATCH] BuildLangModel: try using iconv for conversion when support missing... ... in python. For instance I had the case where the VISCII encoding is supported by iconv but not by encode/decode() function in core python. --- script/BuildLangModel.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py index 8d84ff1..8ed52cf 100755 --- a/script/BuildLangModel.py +++ b/script/BuildLangModel.py @@ -40,6 +40,7 @@ # Third party modules. import unicodedata +import subprocess import wikipedia import importlib import optparse @@ -190,7 +191,22 @@ def process_text(text, lang): # language encodings and its not a special character. for charset in charsets: # Does the character exist in the charset? - codepoint = char.encode(charset, 'ignore') + try: + codepoint = char.encode(charset, 'ignore') + except LookupError: + # Python has no codec for this charset. Use iconv from command line instead. + try: + call = subprocess.Popen(['iconv', '-f', 'UTF-8', '-t', charset], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if call.poll() is not None: + (_, error) = call.communicate(input=b'') + print('Error: `iconv` ended with error "{}".\n'.format(error)) + exit(1) + (codepoint, _) = call.communicate(input=char.encode('UTF-8')) + except FileNotFoundError: + print('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n'.format(charset)) + exit(1) if codepoint == b'': continue @@ -368,6 +384,22 @@ for charset in charsets: print('Unknown character 0X{:X} in {}.'.format(cp, charset)) print('Please verify your charset specification.\n') exit(1) + except LookupError: + # Unknown encoding. Use iconv instead. 
+ try: + call = subprocess.Popen(['iconv', '-t', 'UTF-8', '-f', charset], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if call.poll() is not None: + (_, error) = call.communicate(input=b'') + print('Error: `iconv` ended with error "{}".\n'.format(error)) + exit(1) + (uchar, _) = call.communicate(input=bytes([cp])) + uchar = uchar.decode('UTF-8') + except FileNotFoundError: + print('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n'.format(charset)) + exit(1) #if lang.case_mapping and uchar.isupper() and \ #len(unicodedata.normalize('NFC', uchar.lower())) == 1: # Unless we encounter special cases of characters with no