uchardet/script/create-table.py
Jehan bdd71d88f8 script: improve a bit create-table.py and regenerate the Georgian charsets.
- Avoid trailing whitespaces.
- Print which tool and version were used for the generation (to help for
  future debugging in case of discrepancies between versions or
  implementations).
2022-12-20 14:38:51 +01:00

153 lines
5.3 KiB
Python
Executable File

#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
import optparse
import os
import subprocess
import sys
# Command line handling: the script expects exactly one positional argument,
# the name of the charset to generate a table for.
script_path = os.path.relpath(__file__)
usage = 'Usage: {} <CHARSET-NAME>\n' \
        '\nEx: `{} ISO-8859-15`'.format(script_path, script_path)
description = "Internal tool to generate a charset table."
cmdline = optparse.OptionParser(usage, description = description)
(options, charset) = cmdline.parse_args()
if len(charset) != 1:
    sys.stderr.write("Please choose exactly one charset as argument.\n")
    # Use sys.exit() rather than the exit() helper: the latter is only
    # injected by the `site` module and is meant for interactive sessions.
    sys.exit(1)
# From here on, `charset` is the charset name itself rather than the list of
# positional arguments.
charset = charset[0]
# Choose the decoder. Python's own codecs are preferred; the `iconv` command
# line tool is the fallback for charsets Python has no codec for.
# `dec_version` records which tool and version is used, so that it can be
# written in the header of the generated table.
use_iconv = False
try:
    # Decoding a one-byte sample raises LookupError when Python has no codec
    # registered under this name.
    b' '.decode(charset)
    dec_version = 'Python {}'.format(sys.version).splitlines()[0]
except LookupError:
    use_iconv = True
    # Only the process creation can raise FileNotFoundError, so keep the
    # `try` body limited to it.
    try:
        call = subprocess.Popen(['iconv', '--version'],
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)
    # Wait for `iconv --version` to finish and judge it by its exit status.
    # Polling right after the spawn is a race: the command terminates on its
    # own, so a fast but successful run would be mistaken for a failure.
    (version_out, version_err) = call.communicate(input=b'')
    if call.returncode != 0:
        error = version_err.decode('UTF-8', 'replace').strip()
        if not error:
            error = 'exit status {}'.format(call.returncode)
        sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
        sys.exit(1)
    version_lines = version_out.decode('UTF-8', 'replace').splitlines()
    # Only the first line of the version output is kept. Guard against an
    # implementation printing nothing on stdout.
    dec_version = version_lines[0] if version_lines else 'iconv (unknown version)'
def get_utf8_char(bchar, charset, iconv):
    """Decode `bchar` from `charset` into a Python string.

    Args:
        bchar: the encoded input as a `bytes` object.
        charset: name of the charset `bchar` is encoded in.
        iconv: when true, convert by piping `bchar` through the `iconv`
            command line tool; otherwise use Python's own codecs.

    Returns:
        The decoded string, or None when the input cannot be decoded (Python
        raised UnicodeDecodeError, or `iconv` produced no output).

    Exits the process with status 1 when `iconv` cannot be started, or when
    it is found to have already terminated before any input was sent.
    """
    if not iconv:
        try:
            return bchar.decode(charset)
        except UnicodeDecodeError:
            # Typical error:
            # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 0: character maps to <undefined>
            # It would mean an illegal character.
            return None

    # Only the process creation can raise FileNotFoundError.
    try:
        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)

    # Best-effort early failure detection: no input has been sent yet, so a
    # process which already terminated did not perform any conversion.
    # stderr is captured so that the message can actually say what happened.
    if call.poll() is not None:
        (_, error) = call.communicate()
        error = error.decode('UTF-8', 'replace').strip()
        if not error:
            error = 'exit status {}'.format(call.returncode)
        sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
        sys.exit(1)

    # The exit status is deliberately not checked here: an empty output is
    # what marks the input as illegal in this charset.
    (uchar, _) = call.communicate(input=bchar)
    if len(uchar) > 0:
        return uchar.decode('UTF-8')
    return None
# Write the generated table on stdout: a header naming the generator and the
# decoder used, then a `charmap` list of 16 rows of 16 entries, one entry per
# byte value from 0x00 to 0xFF.
print('## Table generated by {} with: ##'.format(script_path))
print('## {} ##'.format(dec_version))
print('# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #')
print('charmap = \\')
sys.stdout.write('[')
for row in range(0x10):
    sys.stdout.write('\n ')
    # Convert the 16 bytes of this row only once. The result is used both
    # for the table entries and for the debugging comment below, and each
    # conversion may spawn an `iconv` process.
    row_chars = [get_utf8_char(bytes([col + row * 0x10]), charset, use_iconv)
                 for col in range(0x10)]
    has_printable = False
    for char in row_chars:
        if char is None:
            # The byte could not be decoded in this charset.
            sys.stdout.write('ILL,')
        elif char.isalpha():
            sys.stdout.write('LET,')
            has_printable = True
        elif char.isdigit():
            sys.stdout.write('NUM,')
            has_printable = True
        elif char == '\n' or char == '\r':
            sys.stdout.write('RET,')
        elif char.isprintable():
            sys.stdout.write('SYM,')
            has_printable = True
        else:
            sys.stdout.write('CTR,')
    # End the row with a comment giving the high nibble of its byte values.
    sys.stdout.write(' # {}X'.format(hex(row)[-1:].upper()))
    if has_printable:
        # The line has at least one printable character. Print in comment for
        # debugging.
        sys.stdout.write('\n#')
        for char in row_chars:
            if char is None:
                sys.stdout.write(' ILL')
            elif char == '\n' or char == '\r':
                sys.stdout.write(' RET')
            elif char.isalpha() or char.isdigit() or char.isprintable():
                sys.stdout.write(" '{}'".format(char))
            else:
                sys.stdout.write(' CTR')
sys.stdout.write('\n]')