mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 08:46:40 +08:00
script: new create-table script.
I wanted to add new tables for which I could find no listing anywhere, even though iconv has support for them (core Python does not): GEORGIAN-ACADEMY and GEORGIAN-PS. I could find information on these in the libiconv source (./lib/georgian_academy.h and ./lib/georgian_ps.h), but rather than trying to read these files, I thought I should go the other way around: build a table from the return value of the iconv API (or Python decode() when relevant). So this script is able to generate tables in the format used under script/charsets/, from either Python decode() or iconv. It will be very useful!
This commit is contained in:
parent
419a971e6a
commit
c843d23a17
137
script/create-table.py
Executable file
137
script/create-table.py
Executable file
@ -0,0 +1,137 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import optparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# Path of this script relative to the current directory; it is shown in the
# usage text and in the header of the generated table.
script_path = os.path.relpath(__file__)

usage = 'Usage: {} <CHARSET-NAME>\n' \
        '\nEx: `{} ISO-8859-15`'.format(script_path, script_path)

description = "Internal tool to generate a charset table."
cmdline = optparse.OptionParser(usage, description = description)
(options, charset) = cmdline.parse_args()
if len(charset) != 1:
    sys.stderr.write("Please choose exactly one charset as argument.\n")
    # sys.exit() rather than the exit() helper: the latter is injected by the
    # `site` module for interactive use and is missing under `python -S`.
    sys.exit(1)

# parse_args() returned the positional arguments as a list; keep the single
# charset name.
charset = charset[0]

# Prefer Python's own codecs; fall back to the external `iconv` program only
# when Python does not know the charset at all.
use_iconv = False
try:
    b' '.decode(charset)
except LookupError:
    use_iconv = True
except UnicodeDecodeError:
    # The codec exists but rejects a lone space (e.g. a multi-byte encoding).
    # Only the codec's existence matters for this probe, so keep using Python.
    pass
|
||||
|
||||
# Charsets that `iconv` has been confirmed to accept as a source encoding.
_ICONV_VERIFIED_CHARSETS = set()


def _run_iconv(data, charset):
    """Convert `data` (bytes) from `charset` to UTF-8 with the `iconv` program.

    Returns a `(returncode, stdout, stderr)` tuple; both streams are bytes.
    Exits the script with status 1 when the `iconv` program cannot be found.
    """
    try:
        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)
    (output, error) = call.communicate(input=data)
    return (call.returncode, output, error)


def get_utf8_char(bchar, charset, iconv):
    """Decode the byte string `bchar` from `charset`.

    Parameters:
        bchar:   the bytes to decode (the script passes a single byte).
        charset: name of the source charset.
        iconv:   when true, decode through the external `iconv` program;
                 otherwise use Python's `bytes.decode()`.

    Returns the decoded text as `str`, or `None` when `bchar` cannot be
    decoded in this charset (or `iconv` produced no output for it).

    With `iconv` true, the script exits with status 1 when `iconv` is not
    installed or cannot convert from `charset` at all.
    """
    if not iconv:
        try:
            return bchar.decode(charset)
        except UnicodeDecodeError:
            # Typical error:
            # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 0: character maps to <undefined>
            # It would mean an illegal character.
            return None

    (returncode, uchar, error) = _run_iconv(bchar, charset)
    if len(uchar) > 0:
        return uchar.decode('UTF-8')

    if returncode != 0 and charset not in _ICONV_VERIFIED_CHARSETS:
        # A failed conversion is either an illegal byte (expected, reported as
        # None) or a charset `iconv` cannot handle at all. Converting empty
        # input tells the two apart: it can only fail for the latter reason.
        (probe_code, _, probe_error) = _run_iconv(b'', charset)
        if probe_code != 0:
            message = (probe_error or error).decode('UTF-8', 'replace').strip()
            sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(message))
            sys.exit(1)
        _ICONV_VERIFIED_CHARSETS.add(charset)

    return None
|
||||
|
||||
def _char_class(char):
    """Return the table class of a decoded byte.

    `char` is the value returned by get_utf8_char(): `None` for an illegal
    byte, otherwise the decoded string. The result is one of 'ILL', 'LET',
    'NUM', 'RET', 'SYM' or 'CTR'.
    """
    if char is None:
        return 'ILL'
    if char.isalpha():
        return 'LET'
    if char.isdigit():
        return 'NUM'
    if char == '\n' or char == '\r':
        return 'RET'
    if char.isprintable():
        return 'SYM'
    return 'CTR'


# Classes whose characters can be displayed in the debugging comment row.
_PRINTABLE_CLASSES = ('LET', 'NUM', 'SYM')

print('## Table generated by {} with {} ##'.format(script_path, 'iconv' if use_iconv else 'Python decode()'))

# NOTE(review): the column spacing of the header and row strings below is
# reproduced as found; confirm it matches the existing tables under
# script/charsets/.
print('# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #')
print('charmap = \\')
sys.stdout.write('[')
for high in range(0x10):
    # Decode each byte of the row once and reuse the result for both the
    # class row and the debugging row (with iconv, each decode spawns a
    # subprocess).
    chars = [get_utf8_char(bytes([low + high * 0x10]), charset, use_iconv)
             for low in range(0x10)]
    classes = [_char_class(char) for char in chars]

    sys.stdout.write('\n ')
    for char_class in classes:
        sys.stdout.write('{},'.format(char_class))
    sys.stdout.write(' # {:X}X'.format(high))

    if any(char_class in _PRINTABLE_CLASSES for char_class in classes):
        sys.stdout.write('\n #')
        # The line has at least one printable character. Print in comment for
        # debugging.
        for (char, char_class) in zip(chars, classes):
            if char_class in _PRINTABLE_CLASSES:
                sys.stdout.write("'{}' ".format(char))
            else:
                # ILL, RET and CTR are shown by their class name.
                sys.stdout.write('{} '.format(char_class))
sys.stdout.write('\n]')
|
||||
Loading…
x
Reference in New Issue
Block a user