script, src: generate more code for language and sequence model listing.

Right now, each time we add new language or new charset support, there
are too many pieces of code that we must remember to edit. The script
script/BuildLangModel.py will now take care of the main parts: listing
the sequence models, listing the generic language models and computing
the numbers for each listing.

Furthermore the script will now end with a TODO list of the parts which
are still to be done manually (2 functions to edit and a CMakeLists).

Finally the script now allows giving a list of languages to edit rather
than having to run it with languages one by one. It also allows two special
codes: "none", which will retrain none of the languages, but will
re-generate only the new generated listings; and "all" which will
retrain all models (useful in particular when we change the model
formats or usage and want to regenerate everything).
This commit is contained in:
Jehan 2022-12-18 17:13:17 +01:00
parent d6cab28fb4
commit db836fad63
46 changed files with 1160 additions and 844 deletions

View File

@ -72,60 +72,83 @@ cmdline.add_option('--max-depth',
dest = 'max_depth', default = 2)
(options, langs) = cmdline.parse_args()
if len(langs) < 1:
print("Please select at least one language code.\n")
sys.stderr.write("Please select at least one language code. ")
sys.stderr.write("You may also choose 'all' or 'none'.\n")
exit(1)
if len(langs) > 1:
print("This script is meant to generate data for one language at a time.\n")
exit(1)
lang = langs[0]
# Load the language data.
sys_path_backup = sys.path
current_dir = os.path.dirname(os.path.realpath(__file__))
sys.path = [current_dir + '/langs']
try:
lang = importlib.import_module(lang.lower())
except ImportError:
print('Unknown language code "{}": '
'file "langs/{}.py" does not exist.'.format(lang, lang.lower()))
with open(os.path.join(current_dir, "support.txt")) as f:
all_langs = f.readlines()
all_langs = [ l.strip() for l in all_langs if l.strip() != '' ]
if len(langs) == 1:
if langs[0].lower() == 'none':
langs = []
elif langs[0].lower() == 'all':
langs = all_langs
abort = False
for lang in langs:
if lang not in all_langs:
abort = True
sys.stderr.write("Error: unsupported lang: {}\n".format(lang))
if abort:
sys.stderr.write("Info: new langs must be added in 'script/support.txt'.\n")
exit(1)
sys.path = sys_path_backup
charsets = charsets.db.load(lang.charsets)
generated_files = []
if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
for lang_arg in langs:
lang_arg = lang_arg.lower()
# Load the language data.
sys_path_backup = sys.path
sys.path = [current_dir + '/langs']
try:
lang = importlib.import_module(lang_arg)
except ImportError:
sys.stderr.write('Unknown language code "{}": '
'file "langs/{}.py" does not exist.'.format(lang_arg, lang_arg))
exit(1)
sys.path = sys_path_backup
print("Processing language data for {} (lang/{}.py):\n".format(lang_arg, lang_arg))
lang_charsets = charsets.db.load(lang.charsets)
if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
lang.start_pages == []:
# Let's start with the main page, assuming it should have links
# to relevant pages. In localized Wikipedia, this page is usually redirected
# to a relevant page.
print("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
sys.stderr.write("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
" If you don't get good data, it is advised to set a "
"start_pages` variable yourself.".format(lang.code))
lang.start_pages = ['Main_Page']
if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
lang.wikipedia_code = lang.code
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
lang.clean_wikipedia_content = None
if hasattr(lang, 'case_mapping'):
if hasattr(lang, 'case_mapping'):
lang.case_mapping = bool(lang.case_mapping)
else:
else:
lang.case_mapping = False
if not hasattr(lang, 'custom_case_mapping'):
if not hasattr(lang, 'custom_case_mapping'):
lang.custom_case_mapping = None
if not hasattr(lang, 'alphabet') or lang.alphabet is None:
if not hasattr(lang, 'alphabet') or lang.alphabet is None:
lang.alphabet = None
if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
lang.alphabet_mapping = None
if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
lang.unicode_ranges = None
if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
if lang.unicode_ranges is not None:
lang.frequent_ranges = lang.unicode_ranges
else:
lang.frequent_ranges = None
def local_lowercase(text, lang):
def local_lowercase(text, lang):
lowercased = ''
for l in text:
if lang.custom_case_mapping is not None and \
@ -139,14 +162,14 @@ def local_lowercase(text, lang):
lowercased += l
return lowercased
if lang.use_ascii:
if lang.use_ascii:
if lang.alphabet is None:
lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
else:
# Allowing to provide an alphabet in string format rather than list.
lang.alphabet = list(lang.alphabet)
lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
if lang.alphabet is not None:
if lang.alphabet is not None:
# Allowing to provide an alphabet in string format rather than list.
lang.alphabet = list(lang.alphabet)
if lang.case_mapping or lang.custom_case_mapping is not None:
@ -165,7 +188,7 @@ if lang.alphabet is not None:
#alphabet.append(l)
lang.alphabet = list(set(lang.alphabet))
if lang.alphabet_mapping is not None:
if lang.alphabet_mapping is not None:
alphabet_mapping = {}
for char in lang.alphabet_mapping:
# Allowing to provide an alphabet in string format rather than list.
@ -179,7 +202,7 @@ if lang.alphabet_mapping is not None:
alphabet_mapping[alt_char] = char
lang.alphabet_mapping = alphabet_mapping
def normalize_codepoint_ranges(input_range):
def normalize_codepoint_ranges(input_range):
output_range = []
if input_range is not None:
for start, end in input_range:
@ -198,26 +221,26 @@ def normalize_codepoint_ranges(input_range):
output_range = None
return output_range
lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
# Starting processing.
wikipedia.set_lang(lang.wikipedia_code)
# Starting processing.
wikipedia.set_lang(lang.wikipedia_code)
visited_pages = []
visited_pages = []
# The full list of letter characters.
# The key is the unicode codepoint,
# and the value is the occurrence count.
characters = {}
# Sequence of letters.
# The key is the couple (char1, char2) in unicode codepoint,
# the value is the occurrence count.
sequences = {}
prev_char = None
# The full list of letter characters.
# The key is the unicode codepoint,
# and the value is the occurrence count.
characters = {}
# Sequence of letters.
# The key is the couple (char1, char2) in unicode codepoint,
# the value is the occurrence count.
sequences = {}
prev_char = None
def process_text(content, lang):
global charsets
def process_text(content, lang):
global lang_charsets
global characters
global sequences
global prev_char
@ -256,7 +279,7 @@ def process_text(content, lang):
else:
# We save the character if it is at least in one of the
# language encodings and it's not a special character.
for charset in charsets:
for charset in lang_charsets:
# Does the character exist in the charset?
try:
codepoint = char.encode(charset, 'ignore')
@ -268,11 +291,11 @@ def process_text(content, lang):
stderr=subprocess.DEVNULL)
if call.poll() is not None:
(_, error) = call.communicate(input='')
print('Error: `iconv` ended with error "{}".\n'.format(error))
sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
exit(1)
(codepoint, _) = call.communicate(input=char.encode('UTF-8'))
except FileNotFoundError:
print('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
sys.stderr.write('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
exit(1)
if codepoint == b'':
@ -282,7 +305,7 @@ def process_text(content, lang):
# charsets if I turn the string to encoded bytes first.
# Not sure if that is a bug or expected.
codepoint = ord(codepoint)
if charsets[charset].charmap[codepoint] == LET:
if lang_charsets[charset].charmap[codepoint] == LET:
characters[unicode_value] = 1
is_letter = True
break
@ -296,7 +319,7 @@ def process_text(content, lang):
else:
prev_char = None
def visit_pages(titles, depth, lang, logfd):
def visit_pages(titles, depth, lang, logfd):
global visited_pages
global options
@ -317,7 +340,7 @@ def visit_pages(titles, depth, lang, logfd):
# Ugly hack skipping internal pages
if 'wiki' in title or 'Wiki' in title:
print('Skipping', title)
sys.stderr.write('Skipping', title)
continue
visited_pages += [title]
@ -326,7 +349,7 @@ def visit_pages(titles, depth, lang, logfd):
except (wikipedia.exceptions.PageError,
wikipedia.exceptions.DisambiguationError) as error:
# Let's just discard a page when I get an exception.
print("Discarding page {}: {}\n".format(title, error))
sys.stderr.write("Discarding page {}: {}\n".format(title, error))
continue
logfd.write("\n{} (revision {})".format(title, page.revision_id))
logfd.flush()
@ -347,54 +370,54 @@ def visit_pages(titles, depth, lang, logfd):
random.shuffle(next_titles)
visit_pages (next_titles, depth + 1, lang, logfd)
language_c = lang.name.replace('-', '_').title()
build_log = current_dir + '/BuildLangModelLogs/Lang{}Model.log'.format(language_c)
logfd = open(build_log, 'w')
logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
if options.max_page is not None:
language_c = lang.name.replace('-', '_').title()
build_log = current_dir + '/BuildLangModelLogs/Lang{}Model.log'.format(language_c)
logfd = open(build_log, 'w')
logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
if options.max_page is not None:
logfd.write('\n- Max number of pages: {}'.format(options.max_page))
logfd.write('\n\n== Parsed pages ==\n')
logfd.flush()
try:
logfd.write('\n\n== Parsed pages ==\n')
logfd.flush()
try:
visit_pages(lang.start_pages, 0, lang, logfd)
except requests.exceptions.ConnectionError:
print('Error: connection to Wikipedia failed. Aborting\n')
except requests.exceptions.ConnectionError:
sys.stderr.write('Error: connection to Wikipedia failed. Aborting\n')
exit(1)
logfd.write('\n\n== End of Parsed pages ==')
logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
logfd.flush()
logfd.write('\n\n== End of Parsed pages ==')
logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
logfd.flush()
########### CHARACTERS ###########
########### CHARACTERS ###########
# Character ratios.
ratios = {}
n_char = len(characters)
occurrences = sum(characters.values())
# Character ratios.
ratios = {}
n_char = len(characters)
occurrences = sum(characters.values())
logfd.write("\n{} characters appeared {} times.\n".format(n_char, occurrences))
for char in characters:
logfd.write("\n{} characters appeared {} times.\n".format(n_char, occurrences))
for char in characters:
ratios[char] = characters[char] / occurrences
#logfd.write("Character '{}' usage: {} ({} %)\n".format(chr(char),
# characters[char],
# ratios[char] * 100))
sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1),
sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1),
reverse=True)
# Accumulated ratios of the frequent chars.
accumulated_ratios = 0
# Accumulated ratios of the frequent chars.
accumulated_ratios = 0
# If there is no alphabet defined, we just use the first 64 letters, which was
# the original default.
# If there is an alphabet, we make sure all the alphabet characters are in the
# frequent list, and we stop then. There may therefore be more or less than
# 64 frequent characters depending on the language.
logfd.write('\nMost Frequent characters:')
very_freq_count = 0
very_freq_ratio = 0
if lang.alphabet is None and lang.frequent_ranges is None:
# If there is no alphabet defined, we just use the first 64 letters, which was
# the original default.
# If there is an alphabet, we make sure all the alphabet characters are in the
# frequent list, and we stop then. There may therefore be more or less than
# 64 frequent characters depending on the language.
logfd.write('\nMost Frequent characters:')
very_freq_count = 0
very_freq_ratio = 0
if lang.alphabet is None and lang.frequent_ranges is None:
freq_count = min(64, len(sorted_ratios))
for order, (char, ratio) in enumerate(sorted_ratios):
if order >= freq_count:
@ -404,7 +427,7 @@ if lang.alphabet is None and lang.frequent_ranges is None:
if very_freq_ratio < 0.4:
very_freq_count += 1
very_freq_ratio += ratio
elif lang.alphabet is not None:
elif lang.alphabet is not None:
freq_count = 0
for order, (char, ratio) in enumerate(sorted_ratios):
if len(lang.alphabet) == 0:
@ -419,11 +442,11 @@ elif lang.alphabet is not None:
very_freq_ratio += ratio
else:
if len(lang.alphabet) > 0:
print("Error: alphabet characters are absent from data collection"
sys.stderr.write("Error: alphabet characters are absent from data collection"
"\n Please check the configuration or the data."
"\n Missing characters: {}".format(", ".join(lang.alphabet)))
exit(1)
elif lang.frequent_ranges is not None:
elif lang.frequent_ranges is not None:
# How many characters in the frequent range?
frequent_ranges_size = 0
for start, end in lang.frequent_ranges:
@ -453,30 +476,34 @@ elif lang.frequent_ranges is not None:
if frequent_ranges_size <= 0:
break
low_freq_order = freq_count - 1
low_freq_ratio = 0
for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
low_freq_order = freq_count - 1
low_freq_ratio = 0
for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
if low_freq_ratio < 0.03:
low_freq_ratio += ratio
low_freq_order -= 1
else:
break
logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))
logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
c_code = header_fd.read()
c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
c_code += ' **/\n'
c_code += '\n#include "../nsSBCharSetProber.h"'
c_code += '\n#include "../nsSBCharSetProber-generated.h"'
c_code += '\n#include "../nsLanguageDetector.h"\n'
c_code += '\n#include "../nsLanguageDetector-generated.h"\n'
c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
c_code += ' **/\n'
c_code += \
"""
/* Character Mapping Table:
c_code += \
"""
/* Character Mapping Table:
* ILL: illegal character.
* CTR: control character specific to the charset.
* RET: carriage/return.
@ -493,9 +520,9 @@ c_code += \
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
* even though they are both used for French. Same for the euro sign.
*/
"""
"""
for charset in charsets:
for charset in lang_charsets:
charset_c = charset.replace('-', '_').title()
CTOM_str = 'static const unsigned char {}_CharToOrderMap[]'.format(charset_c)
CTOM_str += ' =\n{'
@ -503,7 +530,7 @@ for charset in charsets:
CTOM_str += '\n '
for column in range(0, 16):
cp = line * 16 + column
cp_type = charsets[charset].charmap[cp]
cp_type = lang_charsets[charset].charmap[cp]
if cp_type == ILL:
CTOM_str += 'ILL,'
elif cp_type == RET:
@ -518,8 +545,8 @@ for charset in charsets:
try:
uchar = bytes([cp]).decode(charset)
except UnicodeDecodeError:
print('Unknown character 0X{:X} in {}.'.format(cp, charset))
print('Please verify your charset specification.\n')
sys.stderr.write('Unknown character 0X{:X} in {}.'.format(cp, charset))
sys.stderr.write('Please verify your charset specification.\n')
exit(1)
except LookupError:
# Unknown encoding. Use iconv instead.
@ -530,15 +557,15 @@ for charset in charsets:
stderr=subprocess.PIPE)
if call.poll() is not None:
(_, error) = call.communicate(input='')
print('Error: `iconv` ended with error "{}".\n'.format(error))
sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
exit(1)
(uchar, _) = call.communicate(input=bytes([cp]))
uchar = uchar.decode('UTF-8')
except FileNotFoundError:
print('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
sys.stderr.write('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
exit(1)
if len(uchar) == 0:
print('TypeError: iconv failed to return a unicode character for codepoint "{}" in charset {}.\n'.format(hex(cp), charset))
sys.stderr.write('TypeError: iconv failed to return a unicode character for codepoint "{}" in charset {}.\n'.format(hex(cp), charset))
exit(1)
#if lang.case_mapping and uchar.isupper() and \
#len(unicodedata.normalize('NFC', uchar.lower())) == 1:
@ -569,24 +596,24 @@ for charset in charsets:
CTOM_str += ' */\n\n'
c_code += CTOM_str
## UNICODE frequency.
## UNICODE frequency.
# Since we can't map the full character table from encoding to order,
# just create a list from the most common characters from the language.
# The list is ordered by unicode code points (hence can be used
# generically for various encoding scheme as it is not encoding
# specific) allowing to search from code points efficiently by a divide
# and conquer search algorithm.
# Each code point is immediately followed by its order.
# Since we can't map the full character table from encoding to order,
# just create a list from the most common characters from the language.
# The list is ordered by unicode code points (hence can be used
# generically for various encoding scheme as it is not encoding
# specific) allowing to search from code points efficiently by a divide
# and conquer search algorithm.
# Each code point is immediately followed by its order.
# Keep the freq_count more frequent characters.
sorted_chars = [(char, freq, order) for order, (char, freq) in
# Keep the freq_count more frequent characters.
sorted_chars = [(char, freq, order) for order, (char, freq) in
enumerate(sorted_ratios)][:freq_count]
max_order = len(sorted_chars)
max_order = len(sorted_chars)
# Add equivalency characters.
equivalent = []
if lang.case_mapping:
# Add equivalency characters.
equivalent = []
if lang.case_mapping:
for char, ratio, order in sorted_chars:
uppercased = chr(char).upper()
try:
@ -597,7 +624,7 @@ if lang.case_mapping:
# Just ignore such cases.
sys.stderr.write("Ignoring '{}' as uppercase equivalent of '{}'.\n".format(uppercased, char))
if lang.alphabet_mapping is not None:
if lang.alphabet_mapping is not None:
for alt_c in lang.alphabet_mapping:
for char, ratio, order in sorted_chars:
if alt_c == chr(char):
@ -610,45 +637,45 @@ if lang.alphabet_mapping is not None:
sys.stderr.write("Base equivalent for {} not found in frequent characters!\n".format(alt_c))
exit(1)
sorted_chars += equivalent
sorted_chars += equivalent
# Order by code point.
sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))
# Order by code point.
sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))
CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))
CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))
CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
CTOM_str += ' =\n{'
column = 0
CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
CTOM_str += ' =\n{'
column = 0
max_char_width = math.floor(math.log10(sorted_chars[-1][0])) + 1
max_order_width = math.floor(math.log10(max_order)) + 1
max_char_width = math.floor(math.log10(sorted_chars[-1][0])) + 1
max_order_width = math.floor(math.log10(max_order)) + 1
for char, ratio, order in sorted_chars:
for char, ratio, order in sorted_chars:
if column % 8 == 0:
CTOM_str += '\n '
column += 1
CTOM_str += '{}{:>{width}}, '.format('' if column % 8 == 0 else ' ', char, width=max_char_width)
CTOM_str += '{:>{width}},'.format(order, width=max_order_width)
CTOM_str += '\n};\n\n'
c_code += CTOM_str
CTOM_str += '\n};\n\n'
c_code += CTOM_str
########### SEQUENCES ###########
########### SEQUENCES ###########
ratios = {}
occurrences = sum(sequences.values())
ratios = {}
occurrences = sum(sequences.values())
accumulated_seq_count = 0
order_3 = -1
order_2 = -1
ratio_3 = -1
ratio_2 = -1
count_512 = -1
count_1024 = -1
sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
accumulated_seq_count = 0
order_3 = -1
order_2 = -1
ratio_3 = -1
ratio_2 = -1
count_512 = -1
count_1024 = -1
sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
reverse=True)
for order, ((c1, c2), count) in enumerate(sorted_seqs):
for order, ((c1, c2), count) in enumerate(sorted_seqs):
accumulated_seq_count += count
if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
order_3 = order
@ -664,7 +691,7 @@ for order, ((c1, c2), count) in enumerate(sorted_seqs):
if order_3 != -1 and order_2 != -1:
break
if order_3 == -1 or order_2 == -1:
if order_3 == -1 or order_2 == -1:
# This would probably never happen. It would require a language with
# very few possible sequences where each of the sequences is widely
# used. Just add this code for completion, but it won't likely ever be
@ -674,10 +701,10 @@ if order_3 == -1 or order_2 == -1:
ratio_2 = count_512 / occurrences
ratio_3 = count_1024 / occurrences
logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))
logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))
c_code += """
/* Model Table:
c_code += """
/* Model Table:
* Total considered sequences: {} / {}
* - Positive sequences: first {} ({})
* - Probable sequences: next {} ({}-{}) ({})
@ -693,17 +720,17 @@ c_code += """
1 - ratio_2,
freq_count * freq_count - len(sorted_seqs))
logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
order_2, order_3,
ratio_2 - ratio_3))
logfd.write("\nRest: {}".format(1 - ratio_2))
logfd.write("\nRest: {}".format(1 - ratio_2))
c_code += "\n */\n"
c_code += "\n */\n"
LM_str = 'static const PRUint8 {}LangModel[]'.format(language_c)
LM_str += ' =\n{'
for line in range(0, freq_count):
LM_str = 'static const PRUint8 {}LangModel[]'.format(language_c)
LM_str += ' =\n{'
for line in range(0, freq_count):
LM_str += '\n '
for column in range(0, freq_count):
# Let's not make too long lines.
@ -733,10 +760,10 @@ for line in range(0, freq_count):
# It may indeed happen that we find less than 64 letters used for a
# given language.
LM_str += '0,'
LM_str += '\n};\n'
c_code += LM_str
LM_str += '\n};\n'
c_code += LM_str
for charset in charsets:
for charset in lang_charsets:
charset_c = charset.replace('-', '_').title()
SM_str = '\n\nconst SequenceModel {}{}Model ='.format(charset_c, language_c)
SM_str += '\n{\n '
@ -749,29 +776,112 @@ for charset in charsets:
SM_str += '\n};'
c_code += SM_str
SM_str = '\n\nconst LanguageModel {}Model ='.format(language_c)
SM_str += '\n{'
SM_str += '\n "{}",'.format(lang.code)
SM_str += '\n Unicode_CharOrder,'
SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
SM_str += '\n {}LangModel,'.format(language_c)
SM_str += '\n {},'.format(freq_count)
SM_str += '\n {},'.format(very_freq_count)
SM_str += '\n (float){},'.format(very_freq_ratio)
SM_str += '\n {},'.format(low_freq_order)
SM_str += '\n (float){},'.format(low_freq_ratio)
SM_str += '\n};'
c_code += SM_str
SM_str = '\n\nconst LanguageModel {}Model ='.format(language_c)
SM_str += '\n{'
SM_str += '\n "{}",'.format(lang.code)
SM_str += '\n Unicode_CharOrder,'
SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
SM_str += '\n {}LangModel,'.format(language_c)
SM_str += '\n {},'.format(freq_count)
SM_str += '\n {},'.format(very_freq_count)
SM_str += '\n (float){},'.format(very_freq_ratio)
SM_str += '\n {},'.format(low_freq_order)
SM_str += '\n (float){},'.format(low_freq_ratio)
SM_str += '\n};'
c_code += SM_str
c_code += '\n'
c_code += '\n'
lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c)
with open(lang_model_file, 'w') as cpp_fd:
lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c)
with open(lang_model_file, 'w') as cpp_fd:
cpp_fd.write(c_code)
logfd.write('\n\n- Processing end: {}\n'.format(str(datetime.datetime.now())))
logfd.close()
logfd.write('\n\n- Processing end: {}\n'.format(str(datetime.datetime.now())))
logfd.close()
print("The following language model file has been generated: {}"
"\nThe build log is available in: {}"
"\nTest them and commit them.".format(lang_model_file, build_log))
generated_files += [ (lang_model_file, build_log) ]
charset_cpp = os.path.join(current_dir, '../src', 'nsSBCharSetProber-generated.h')
print("\nGenerating {}".format(charset_cpp))
with open(charset_cpp, 'w') as cpp_fd:
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
cpp_fd.write(header_fd.read())
cpp_fd.write('\n#ifndef nsSingleByteCharSetProber_generated_h__')
cpp_fd.write('\n#define nsSingleByteCharSetProber_generated_h__\n')
all_extern_declarations = ''
n_sequence_models = 0
for l in all_langs:
l = l.lower()
# Load the language data.
sys_path_backup = sys.path
sys.path = [current_dir + '/langs']
try:
lang = importlib.import_module(l)
except ImportError:
sys.stderr.write('Unknown language code "{}": '
'file "langs/{}.py" does not exist.'.format(l, l))
exit(1)
sys.path = sys_path_backup
language_c = lang.name.replace('-', '_').title()
lang_charsets = charsets.db.load(lang.charsets)
for charset in lang_charsets:
charset_c = charset.replace('-', '_').title()
all_extern_declarations += '\nextern const SequenceModel {}{}Model;'.format(charset_c, language_c)
n_sequence_models += 1
all_extern_declarations += '\n'
cpp_fd.write('\n#define NUM_OF_SEQUENCE_MODELS {}\n'.format(n_sequence_models))
cpp_fd.write('{}'.format(all_extern_declarations))
cpp_fd.write('\n#endif /* nsSingleByteCharSetProber_generated_h__ */')
print("Done!")
language_cpp = os.path.join(current_dir, '../src', 'nsLanguageDetector-generated.h')
print("\nGenerating {}".format(language_cpp))
with open(language_cpp, 'w') as cpp_fd:
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
cpp_fd.write(header_fd.read())
cpp_fd.write('\n#ifndef nsLanguageDetector_h_generated_h__')
cpp_fd.write('\n#define nsLanguageDetector_h_generated_h__\n')
all_extern_declarations = ''
n_language_models = 0
for l in all_langs:
l = l.lower()
# Load the language data.
sys_path_backup = sys.path
sys.path = [current_dir + '/langs']
try:
lang = importlib.import_module(l)
except ImportError:
sys.stderr.write('Unknown language code "{}": '
'file "langs/{}.py" does not exist.'.format(l, l))
exit(1)
sys.path = sys_path_backup
language_c = lang.name.replace('-', '_').title()
all_extern_declarations += '\nextern const LanguageModel {}Model;'.format(language_c)
n_language_models += 1
cpp_fd.write('\n#define NUM_OF_LANGUAGE_MODELS {}\n'.format(n_language_models))
cpp_fd.write('{}'.format(all_extern_declarations))
cpp_fd.write('\n\n#endif /* nsLanguageDetector_h_generated_h__ */')
print("Done!")
if len(generated_files) > 0:
print("\nThe following language files has been generated:")
for (lang_model_file, build_log) in generated_files:
print("\n- Language file: {}".format(lang_model_file))
print("\n Build log: {}".format(build_log))
print("\nTODO:")
print("- edit nsSBCSGroupProber::nsSBCSGroupProber() in src/nsSBCSGroupProber.cpp manually to test new sequence models;")
print("- edit nsMBCSGroupProber::nsMBCSGroupProber() in src/nsMBCSGroupProber.cpp manually to test new language models;")
print("- add any new language files to src/CMakeLists.txt;")
print("- commit generated files if tests are successful.")

View File

@ -34,6 +34,3 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsLanguageDetector.h"

36
script/support.txt Normal file
View File

@ -0,0 +1,36 @@
ar
be
bg
cs
da
de
el
en
eo
es
et
fi
fr
ga
he
hi
hr
hu
it
lt
lv
mk
mt
no
pl
pt
ro
ru
sk
sl
sr
sv
th
tr
uk
vi

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Arabic *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Belarusian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Bulgarian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Croatian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Czech *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Danish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: English *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Esperanto *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Estonian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Finnish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: French *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: German *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Greek *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Hebrew *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Hindi *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Hungarian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Irish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Italian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Latvian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Lithuanian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Macedonian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Maltese *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Norwegian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Polish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Portuguese *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Romanian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Russian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Serbian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Slovak *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Slovene *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Spanish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Swedish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Thai *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Turkish *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Ukrainian *********/

View File

@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Vietnamese *********/

View File

@ -0,0 +1,80 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsLanguageDetector_h_generated_h__
#define nsLanguageDetector_h_generated_h__
#define NUM_OF_LANGUAGE_MODELS 36
extern const LanguageModel ArabicModel;
extern const LanguageModel BelarusianModel;
extern const LanguageModel BulgarianModel;
extern const LanguageModel CzechModel;
extern const LanguageModel DanishModel;
extern const LanguageModel GermanModel;
extern const LanguageModel GreekModel;
extern const LanguageModel EnglishModel;
extern const LanguageModel EsperantoModel;
extern const LanguageModel SpanishModel;
extern const LanguageModel EstonianModel;
extern const LanguageModel FinnishModel;
extern const LanguageModel FrenchModel;
extern const LanguageModel IrishModel;
extern const LanguageModel HebrewModel;
extern const LanguageModel HindiModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel HungarianModel;
extern const LanguageModel ItalianModel;
extern const LanguageModel LithuanianModel;
extern const LanguageModel LatvianModel;
extern const LanguageModel MacedonianModel;
extern const LanguageModel MalteseModel;
extern const LanguageModel NorwegianModel;
extern const LanguageModel PolishModel;
extern const LanguageModel PortugueseModel;
extern const LanguageModel RomanianModel;
extern const LanguageModel RussianModel;
extern const LanguageModel SlovakModel;
extern const LanguageModel SloveneModel;
extern const LanguageModel SerbianModel;
extern const LanguageModel SwedishModel;
extern const LanguageModel ThaiModel;
extern const LanguageModel TurkishModel;
extern const LanguageModel UkrainianModel;
extern const LanguageModel VietnameseModel;
#endif /* nsLanguageDetector_h_generated_h__ */

View File

@ -125,41 +125,4 @@ private:
int GetOrderFromCodePoint(int codePoint);
};
extern const LanguageModel ArabicModel;
extern const LanguageModel BelarusianModel;
extern const LanguageModel BulgarianModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel CzechModel;
extern const LanguageModel DanishModel;
extern const LanguageModel EnglishModel;
extern const LanguageModel EsperantoModel;
extern const LanguageModel EstonianModel;
extern const LanguageModel FinnishModel;
extern const LanguageModel FrenchModel;
extern const LanguageModel GermanModel;
extern const LanguageModel GreekModel;
extern const LanguageModel HebrewModel;
extern const LanguageModel HindiModel;
extern const LanguageModel HungarianModel;
extern const LanguageModel IrishModel;
extern const LanguageModel ItalianModel;
extern const LanguageModel LatvianModel;
extern const LanguageModel LithuanianModel;
extern const LanguageModel MacedonianModel;
extern const LanguageModel MalteseModel;
extern const LanguageModel NorwegianModel;
extern const LanguageModel PolishModel;
extern const LanguageModel PortugueseModel;
extern const LanguageModel RomanianModel;
extern const LanguageModel RussianModel;
extern const LanguageModel SerbianModel;
extern const LanguageModel SlovakModel;
extern const LanguageModel SloveneModel;
extern const LanguageModel SpanishModel;
extern const LanguageModel SwedishModel;
extern const LanguageModel ThaiModel;
extern const LanguageModel TurkishModel;
extern const LanguageModel UkrainianModel;
extern const LanguageModel VietnameseModel;
#endif /* nsLanguageDetector_h__ */

View File

@ -48,8 +48,11 @@
#include "nsBig5Prober.h"
#include "nsEUCTWProber.h"
#include "nsLanguageDetector-generated.h"
#define NUM_OF_PROBERS 8
#define NUM_OF_LANGUAGES 37
/* All the generated language model + the CJK detector. */
#define NUM_OF_LANGUAGES (NUM_OF_LANGUAGE_MODELS + 1)
class nsMBCSGroupProber: public nsCharSetProber {
public:

View File

@ -36,10 +36,12 @@
*
* ***** END LICENSE BLOCK ***** */
#include <assert.h>
#include <stdio.h>
#include "prmem.h"
#include "nsSBCharSetProber.h"
#include "nsSBCharSetProber-generated.h"
#include "nsSBCSGroupProber.h"
#include "nsHebrewProber.h"
@ -50,6 +52,14 @@ nsSBCSGroupProber::nsSBCSGroupProber()
PRUint32 heb_prober_idx;
PRUint32 n = 0;
/* We create more probers than sequence models because of Hebrew handling,
* making Windows_1255HebrewModel and Ibm862HebrewModel used twice, while
* Iso_8859_8HebrewModel is currently unused.
*/
n_sbcs_probers = NUM_OF_SEQUENCE_MODELS + 2;
mProbers = new nsCharSetProber*[n_sbcs_probers];
mIsActive = new PRBool[n_sbcs_probers];
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251RussianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Koi8_RRussianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5RussianModel);
@ -226,15 +236,19 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel);
assert (n_sbcs_probers == n);
Reset();
}
nsSBCSGroupProber::~nsSBCSGroupProber()
{
for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
for (PRUint32 i = 0; i < n_sbcs_probers; i++)
{
delete mProbers[i];
}
delete mProbers;
delete mIsActive;
}
@ -266,7 +280,7 @@ const char* nsSBCSGroupProber::GetLanguage(int candidate)
void nsSBCSGroupProber::Reset(void)
{
mActiveNum = 0;
for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
for (PRUint32 i = 0; i < n_sbcs_probers; i++)
{
if (mProbers[i]) // not null
{
@ -303,7 +317,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
if (newLen1 == 0)
goto done; // Nothing to see here, move on.
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
for (i = 0; i < n_sbcs_probers; i++)
{
if (!mIsActive[i])
continue;
@ -344,7 +358,7 @@ float nsSBCSGroupProber::GetConfidence(int candidate)
case eNotMe:
return (float)0.01; //sure no
default:
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
for (i = 0; i < n_sbcs_probers; i++)
{
if (!mIsActive[i])
continue;
@ -367,7 +381,7 @@ void nsSBCSGroupProber::DumpStatus()
cf = GetConfidence(0);
printf(" SBCS Group Prober --------begin status \r\n");
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
for (i = 0; i < n_sbcs_probers; i++)
{
if (!mIsActive[i])
printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName(0));

View File

@ -39,9 +39,6 @@
#ifndef nsSBCSGroupProber_h__
#define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 117
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {
public:
@ -64,11 +61,11 @@ public:
protected:
nsProbingState mState;
nsCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];
PRBool mIsActive[NUM_OF_SBCS_PROBERS];
nsCharSetProber **mProbers;
PRBool *mIsActive;
PRInt32 mBestGuess;
PRUint32 mActiveNum;
PRUint32 n_sbcs_probers;
};
#endif /* nsSBCSGroupProber_h__ */

View File

@ -0,0 +1,194 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsSingleByteCharSetProber_generated_h__
#define nsSingleByteCharSetProber_generated_h__
#define NUM_OF_SEQUENCE_MODELS 115
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Windows_1256ArabicModel;
extern const SequenceModel Windows_1251BelarusianModel;
extern const SequenceModel Iso_8859_5BelarusianModel;
extern const SequenceModel Windows_1251BulgarianModel;
extern const SequenceModel Iso_8859_5BulgarianModel;
extern const SequenceModel Iso_8859_2CzechModel;
extern const SequenceModel Windows_1250CzechModel;
extern const SequenceModel Ibm852CzechModel;
extern const SequenceModel Mac_CentraleuropeCzechModel;
extern const SequenceModel Iso_8859_15DanishModel;
extern const SequenceModel Iso_8859_1DanishModel;
extern const SequenceModel Windows_1252DanishModel;
extern const SequenceModel Ibm865DanishModel;
extern const SequenceModel Iso_8859_1GermanModel;
extern const SequenceModel Windows_1252GermanModel;
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;
extern const SequenceModel Iso_8859_1EnglishModel;
extern const SequenceModel Windows_1252EnglishModel;
extern const SequenceModel Iso_8859_3EsperantoModel;
extern const SequenceModel Iso_8859_15SpanishModel;
extern const SequenceModel Iso_8859_1SpanishModel;
extern const SequenceModel Windows_1252SpanishModel;
extern const SequenceModel Iso_8859_4EstonianModel;
extern const SequenceModel Iso_8859_13EstonianModel;
extern const SequenceModel Iso_8859_15EstonianModel;
extern const SequenceModel Windows_1252EstonianModel;
extern const SequenceModel Windows_1257EstonianModel;
extern const SequenceModel Iso_8859_1FinnishModel;
extern const SequenceModel Iso_8859_4FinnishModel;
extern const SequenceModel Iso_8859_9FinnishModel;
extern const SequenceModel Iso_8859_13FinnishModel;
extern const SequenceModel Iso_8859_15FinnishModel;
extern const SequenceModel Windows_1252FinnishModel;
extern const SequenceModel Iso_8859_15FrenchModel;
extern const SequenceModel Iso_8859_1FrenchModel;
extern const SequenceModel Windows_1252FrenchModel;
extern const SequenceModel Iso_8859_15IrishModel;
extern const SequenceModel Iso_8859_1IrishModel;
extern const SequenceModel Iso_8859_9IrishModel;
extern const SequenceModel Windows_1252IrishModel;
extern const SequenceModel Iso_8859_8HebrewModel;
extern const SequenceModel Windows_1255HebrewModel;
extern const SequenceModel Ibm862HebrewModel;
extern const SequenceModel Iso_8859_2CroatianModel;
extern const SequenceModel Iso_8859_13CroatianModel;
extern const SequenceModel Iso_8859_16CroatianModel;
extern const SequenceModel Windows_1250CroatianModel;
extern const SequenceModel Ibm852CroatianModel;
extern const SequenceModel Mac_CentraleuropeCroatianModel;
extern const SequenceModel Iso_8859_2HungarianModel;
extern const SequenceModel Windows_1250HungarianModel;
extern const SequenceModel Iso_8859_1ItalianModel;
extern const SequenceModel Iso_8859_3ItalianModel;
extern const SequenceModel Iso_8859_9ItalianModel;
extern const SequenceModel Iso_8859_15ItalianModel;
extern const SequenceModel Windows_1252ItalianModel;
extern const SequenceModel Iso_8859_4LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
extern const SequenceModel Iso_8859_13LithuanianModel;
extern const SequenceModel Iso_8859_4LatvianModel;
extern const SequenceModel Iso_8859_10LatvianModel;
extern const SequenceModel Iso_8859_13LatvianModel;
extern const SequenceModel Windows_1251MacedonianModel;
extern const SequenceModel Ibm855MacedonianModel;
extern const SequenceModel Iso_8859_5MacedonianModel;
extern const SequenceModel Iso_8859_3MalteseModel;
extern const SequenceModel Ibm865NorwegianModel;
extern const SequenceModel Iso_8859_15NorwegianModel;
extern const SequenceModel Iso_8859_1NorwegianModel;
extern const SequenceModel Windows_1252NorwegianModel;
extern const SequenceModel Iso_8859_2PolishModel;
extern const SequenceModel Iso_8859_13PolishModel;
extern const SequenceModel Iso_8859_16PolishModel;
extern const SequenceModel Windows_1250PolishModel;
extern const SequenceModel Ibm852PolishModel;
extern const SequenceModel Mac_CentraleuropePolishModel;
extern const SequenceModel Iso_8859_15PortugueseModel;
extern const SequenceModel Iso_8859_1PortugueseModel;
extern const SequenceModel Windows_1252PortugueseModel;
extern const SequenceModel Iso_8859_9PortugueseModel;
extern const SequenceModel Iso_8859_2RomanianModel;
extern const SequenceModel Iso_8859_16RomanianModel;
extern const SequenceModel Windows_1250RomanianModel;
extern const SequenceModel Ibm852RomanianModel;
extern const SequenceModel Windows_1251RussianModel;
extern const SequenceModel Iso_8859_5RussianModel;
extern const SequenceModel Koi8_RRussianModel;
extern const SequenceModel Ibm855RussianModel;
extern const SequenceModel Ibm866RussianModel;
extern const SequenceModel Mac_CyrillicRussianModel;
extern const SequenceModel Iso_8859_2SlovakModel;
extern const SequenceModel Windows_1250SlovakModel;
extern const SequenceModel Ibm852SlovakModel;
extern const SequenceModel Mac_CentraleuropeSlovakModel;
extern const SequenceModel Iso_8859_2SloveneModel;
extern const SequenceModel Iso_8859_16SloveneModel;
extern const SequenceModel Windows_1250SloveneModel;
extern const SequenceModel Ibm852SloveneModel;
extern const SequenceModel Mac_CentraleuropeSloveneModel;
extern const SequenceModel Windows_1251SerbianModel;
extern const SequenceModel Iso_8859_5SerbianModel;
extern const SequenceModel Iso_8859_1SwedishModel;
extern const SequenceModel Iso_8859_4SwedishModel;
extern const SequenceModel Iso_8859_9SwedishModel;
extern const SequenceModel Iso_8859_15SwedishModel;
extern const SequenceModel Windows_1252SwedishModel;
extern const SequenceModel Iso_8859_11ThaiModel;
extern const SequenceModel Tis_620ThaiModel;
extern const SequenceModel Iso_8859_3TurkishModel;
extern const SequenceModel Iso_8859_9TurkishModel;
extern const SequenceModel Windows_1251UkrainianModel;
extern const SequenceModel Windows_1258VietnameseModel;
extern const SequenceModel VisciiVietnameseModel;
#endif /* nsSingleByteCharSetProber_generated_h__ */

View File

@ -131,154 +131,4 @@ protected:
};
extern const SequenceModel Windows_1256ArabicModel;
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Koi8_RRussianModel;
extern const SequenceModel Windows_1251RussianModel;
extern const SequenceModel Iso_8859_5RussianModel;
extern const SequenceModel Mac_CyrillicRussianModel;
extern const SequenceModel Ibm866RussianModel;
extern const SequenceModel Ibm855RussianModel;
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;
extern const SequenceModel Iso_8859_5BelarusianModel;
extern const SequenceModel Windows_1251BelarusianModel;
extern const SequenceModel Iso_8859_5BulgarianModel;
extern const SequenceModel Windows_1251BulgarianModel;
extern const SequenceModel Iso_8859_2HungarianModel;
extern const SequenceModel Windows_1250HungarianModel;
extern const SequenceModel Windows_1255HebrewModel;
extern const SequenceModel Ibm862HebrewModel;
extern const SequenceModel Tis_620ThaiModel;
extern const SequenceModel Iso_8859_11ThaiModel;
extern const SequenceModel Iso_8859_15FrenchModel;
extern const SequenceModel Iso_8859_1FrenchModel;
extern const SequenceModel Windows_1252FrenchModel;
extern const SequenceModel Iso_8859_15SpanishModel;
extern const SequenceModel Iso_8859_1SpanishModel;
extern const SequenceModel Windows_1252SpanishModel;
extern const SequenceModel Iso_8859_1GermanModel;
extern const SequenceModel Windows_1252GermanModel;
extern const SequenceModel Iso_8859_3EsperantoModel;
extern const SequenceModel Iso_8859_3TurkishModel;
extern const SequenceModel Iso_8859_9TurkishModel;
extern const SequenceModel VisciiVietnameseModel;
extern const SequenceModel Windows_1258VietnameseModel;
extern const SequenceModel Iso_8859_15DanishModel;
extern const SequenceModel Iso_8859_1DanishModel;
extern const SequenceModel Windows_1252DanishModel;
extern const SequenceModel Ibm865DanishModel;
extern const SequenceModel Iso_8859_1EnglishModel;
extern const SequenceModel Windows_1252EnglishModel;
extern const SequenceModel Iso_8859_13LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
extern const SequenceModel Iso_8859_4LithuanianModel;
extern const SequenceModel Iso_8859_13LatvianModel;
extern const SequenceModel Iso_8859_10LatvianModel;
extern const SequenceModel Iso_8859_4LatvianModel;
extern const SequenceModel Iso_8859_1PortugueseModel;
extern const SequenceModel Iso_8859_9PortugueseModel;
extern const SequenceModel Iso_8859_15PortugueseModel;
extern const SequenceModel Windows_1252PortugueseModel;
extern const SequenceModel Iso_8859_3MalteseModel;
extern const SequenceModel Windows_1250CzechModel;
extern const SequenceModel Iso_8859_2CzechModel;
extern const SequenceModel Ibm852CzechModel;
extern const SequenceModel Mac_CentraleuropeCzechModel;
extern const SequenceModel Windows_1250SlovakModel;
extern const SequenceModel Iso_8859_2SlovakModel;
extern const SequenceModel Ibm852SlovakModel;
extern const SequenceModel Mac_CentraleuropeSlovakModel;
extern const SequenceModel Windows_1250PolishModel;
extern const SequenceModel Iso_8859_2PolishModel;
extern const SequenceModel Iso_8859_13PolishModel;
extern const SequenceModel Iso_8859_16PolishModel;
extern const SequenceModel Ibm852PolishModel;
extern const SequenceModel Mac_CentraleuropePolishModel;
extern const SequenceModel Iso_8859_1FinnishModel;
extern const SequenceModel Iso_8859_4FinnishModel;
extern const SequenceModel Iso_8859_9FinnishModel;
extern const SequenceModel Iso_8859_13FinnishModel;
extern const SequenceModel Iso_8859_15FinnishModel;
extern const SequenceModel Windows_1252FinnishModel;
extern const SequenceModel Iso_8859_1ItalianModel;
extern const SequenceModel Iso_8859_3ItalianModel;
extern const SequenceModel Iso_8859_9ItalianModel;
extern const SequenceModel Iso_8859_15ItalianModel;
extern const SequenceModel Windows_1252ItalianModel;
extern const SequenceModel Windows_1250CroatianModel;
extern const SequenceModel Iso_8859_2CroatianModel;
extern const SequenceModel Iso_8859_13CroatianModel;
extern const SequenceModel Iso_8859_16CroatianModel;
extern const SequenceModel Ibm852CroatianModel;
extern const SequenceModel Mac_CentraleuropeCroatianModel;
extern const SequenceModel Windows_1252EstonianModel;
extern const SequenceModel Windows_1257EstonianModel;
extern const SequenceModel Iso_8859_4EstonianModel;
extern const SequenceModel Iso_8859_13EstonianModel;
extern const SequenceModel Iso_8859_15EstonianModel;
extern const SequenceModel Iso_8859_15IrishModel;
extern const SequenceModel Iso_8859_9IrishModel;
extern const SequenceModel Iso_8859_1IrishModel;
extern const SequenceModel Windows_1252IrishModel;
extern const SequenceModel Windows_1250RomanianModel;
extern const SequenceModel Iso_8859_2RomanianModel;
extern const SequenceModel Iso_8859_16RomanianModel;
extern const SequenceModel Ibm852RomanianModel;
extern const SequenceModel Windows_1250SloveneModel;
extern const SequenceModel Iso_8859_2SloveneModel;
extern const SequenceModel Iso_8859_16SloveneModel;
extern const SequenceModel Ibm852SloveneModel;
extern const SequenceModel Mac_CentraleuropeSloveneModel;
extern const SequenceModel Iso_8859_1SwedishModel;
extern const SequenceModel Iso_8859_4SwedishModel;
extern const SequenceModel Iso_8859_9SwedishModel;
extern const SequenceModel Iso_8859_15SwedishModel;
extern const SequenceModel Windows_1252SwedishModel;
extern const SequenceModel Iso_8859_15NorwegianModel;
extern const SequenceModel Iso_8859_1NorwegianModel;
extern const SequenceModel Windows_1252NorwegianModel;
extern const SequenceModel Ibm865NorwegianModel;
extern const SequenceModel Windows_1251UkrainianModel;
extern const SequenceModel Windows_1251SerbianModel;
extern const SequenceModel Iso_8859_5SerbianModel;
extern const SequenceModel Windows_1251MacedonianModel;
extern const SequenceModel Ibm855MacedonianModel;
extern const SequenceModel Iso_8859_5MacedonianModel;
#endif /* nsSingleByteCharSetProber_h__ */