#!/bin/python3
# -*- coding: utf-8 -*-

# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Jehan
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####

"""Print a 16x16 character-class table for a single-byte charset.

Each of the 256 byte values is decoded with the requested charset and
classified as ILL (cannot be decoded), LET (letter), NUM (digit), RET
(line feed or carriage return), SYM (other printable character) or CTR
(anything else). The table is written to standard output as a Python
list literal named ``charmap``.
"""

import optparse
import os
import subprocess
import sys

script_path = os.path.relpath(__file__)

usage = 'Usage: {} \n' \
        '\nEx: `{} ISO-8859-15`'.format(script_path, script_path)

description = "Internal tool to generate a charset table."

# Classes whose characters are worth echoing in the per-row debug comment.
_PRINTABLE_CLASSES = ('LET', 'NUM', 'SYM')


def get_utf8_char(bchar, charset, iconv):
    """Decode the bytes *bchar* from *charset* into a ``str``.

    Args:
        bchar: the ``bytes`` to decode (one byte per call in this script).
        charset: name of the source charset.
        iconv: when true, convert by piping *bchar* through the external
            ``iconv`` program; otherwise use ``bytes.decode()``.

    Returns:
        The decoded string, or ``None`` when Python raises
        ``UnicodeDecodeError`` or when ``iconv`` produces no output.

    Exits the process with status 1 when ``iconv`` cannot be started, or
    has already terminated before any input was sent to it.
    """
    if not iconv:
        try:
            return bchar.decode(charset)
        except UnicodeDecodeError:
            # Typical error:
            # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in
            # position 0: character maps to <undefined>
            # It would mean an illegal character.
            return None

    # Only starting the process can raise FileNotFoundError, so keep the
    # `try` limited to that call.
    try:
        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)

    with call:
        # Best-effort early check: poll() is only non-None if the child has
        # already exited by the time we get here.
        if call.poll() is not None:
            (_, error) = call.communicate(input=b'')
            # stderr is captured (not discarded) so the message shows what
            # iconv actually reported.
            message = error.decode(errors='replace').strip()
            sys.stderr.write(
                'Error: `iconv` ended with error "{}".\n'.format(message))
            sys.exit(1)
        (uchar, _) = call.communicate(input=bchar)

    if uchar:
        return uchar.decode('UTF-8')
    # No output from iconv: treat the byte as illegal in this charset.
    return None


def _classify_char(char):
    """Return the three-letter class name for a decoded character.

    *char* is the value returned by get_utf8_char(): a ``str`` or ``None``.
    """
    if char is None:
        return 'ILL'
    if char.isalpha():
        return 'LET'
    if char.isdigit():
        return 'NUM'
    if char in ('\n', '\r'):
        return 'RET'
    if char.isprintable():
        return 'SYM'
    return 'CTR'


def format_row(row, charset, use_iconv):
    """Return the text of one table row (the 16 bytes ``row * 0x10 + 0..15``).

    The text starts with a newline and holds the 16 comma-terminated class
    names followed by a ``# <row>X`` label. When at least one byte of the
    row is a letter, digit or symbol, a second comment line follows that
    shows the decoded characters themselves, for debugging.
    """
    # Decode each byte exactly once; with iconv every decode spawns a
    # process, so the class line and the debug line share the result.
    chars = [get_utf8_char(bytes([col + row * 0x10]), charset, use_iconv)
             for col in range(0x10)]
    classes = [_classify_char(char) for char in chars]

    parts = ['\n ']
    parts.extend('{},'.format(name) for name in classes)
    parts.append(' # {:X}X'.format(row))

    if any(name in _PRINTABLE_CLASSES for name in classes):
        # The line has at least one printable character. Print in comment
        # for debugging.
        parts.append('\n #')
        for char, name in zip(chars, classes):
            if name in _PRINTABLE_CLASSES:
                parts.append("'{}' ".format(char))
            else:
                parts.append('{} '.format(name))
    return ''.join(parts)


def main():
    """Parse the command line and write the charset table to stdout."""
    cmdline = optparse.OptionParser(usage, description=description)
    (_options, args) = cmdline.parse_args()
    if len(args) != 1:
        sys.stderr.write("Please choose exactly one charset as argument.\n")
        sys.exit(1)
    charset = args[0]

    # Fall back to the external `iconv` when Python has no codec of that
    # name.
    use_iconv = False
    try:
        b' '.decode(charset)
    except LookupError:
        use_iconv = True

    print('## Table generated by {} with {} ##'.format(
        script_path, 'iconv' if use_iconv else 'Python decode()'))
    print('# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #')
    print('charmap = \\')
    sys.stdout.write('[')
    for row in range(0x10):
        sys.stdout.write(format_row(row, charset, use_iconv))
    sys.stdout.write('\n]')


if __name__ == '__main__':
    main()