uchardet/script/BuildLangModel.py

#!/bin/python3
# -*- coding: utf-8 -*-

# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#          Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####

# Standard library and third-party modules.
import unicodedata
import subprocess
import wikipedia
import importlib
import math
import optparse
import datetime
import operator
import requests
import sys
import re
import os
import random

# Custom modules.
import charsets.db
from charsets.codepoints import *

# Command line processing.
usage = 'Usage: {} <LANG-CODE>\n' \
        '\nEx: `{} fr`'.format(__file__, __file__)

description = "Internal tool for uchardet to generate language data."
cmdline = optparse.OptionParser(usage, description = description)
cmdline.add_option('--max-page',
                   help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
                   action = 'store', type = 'int', dest = 'max_page', default = None)
cmdline.add_option('--max-depth',
                   help = 'Maximum depth when following links from start page (default: 2).',
                   action = 'store', type = 'int',
                   dest = 'max_depth', default = 2)
# `langs` receives the positional arguments, i.e. the language codes.
(options, langs) = cmdline.parse_args()
if len(langs) < 1:
  sys.stderr.write("Please select at least one language code. ")
  sys.stderr.write("You may also choose 'all' or 'none'.\n")
  # sys.exit() rather than the exit() helper: the latter is only injected by
  # the `site` module and is meant for the interactive interpreter.
  sys.exit(1)

# Directory of this script; data files are resolved relative to it.
current_dir = os.path.dirname(os.path.realpath(__file__))

# `support.txt` holds one supported language code per line; blank lines are
# ignored and surrounding whitespace is stripped.
with open(os.path.join(current_dir, "support.txt")) as f:
    all_langs = [ l.strip() for l in f if l.strip() != '' ]

# A single 'none' or 'all' argument (any letter case) is a shortcut for no
# language at all or for every supported language.
if len(langs) == 1:
  if langs[0].lower() == 'none':
    langs = []
  elif langs[0].lower() == 'all':
    langs = all_langs

# Report every unsupported code before aborting, not only the first one.
abort = False
for lang in langs:
  if lang not in all_langs:
    abort = True
    sys.stderr.write("Error: unsupported lang: {}\n".format(lang))
if abort:
  sys.stderr.write("Info: new langs must be added in 'script/support.txt'.\n")
  sys.exit(1)

# One (generated C++ file, build log) pair is appended per processed language.
generated_files = []

for lang_arg in langs:
  lang_arg = lang_arg.lower()

  # Load the language data.
  # The import path is temporarily reduced to the `langs` directory so that
  # the language code resolves to `langs/<code>.py`.  The original path is
  # restored in `finally`, whether or not the import succeeds.
  sys_path_backup = sys.path
  sys.path = [current_dir + '/langs']
  try:
      lang = importlib.import_module(lang_arg)
  except ImportError:
      sys.stderr.write('Unknown language code "{}": '
                       'file "langs/{}.py" does not exist.\n'.format(lang_arg, lang_arg))
      sys.exit(1)
  finally:
      sys.path = sys_path_backup

  print("Processing language data for {} (langs/{}.py):\n".format(lang_arg, lang_arg))

  # `lang.charsets` names the encodings of the language; the loaded object
  # is later iterated by charset name and indexed for its `charmap`.
  lang_charsets = charsets.db.load(lang.charsets)

  # Give every optional attribute of the language module a defined value so
  # that the rest of the script can read it unconditionally.
  if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
     lang.start_pages == []:
      # Let's start with the main page, assuming it should have links
      # to relevant pages. In locale wikipedia, this page is usually redirected
      # to a relevant page.
      sys.stderr.write("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
                       "         If you don't get good data, it is advised to set a "
                       "`start_pages` variable yourself.\n".format(lang.code))
      lang.start_pages = ['Main_Page']
  # The Wikipedia edition defaults to the language code itself.
  if getattr(lang, 'wikipedia_code', None) is None:
      lang.wikipedia_code = lang.code
  # Attributes whose absence simply means "not configured" (None).
  for optional_attr in ('clean_wikipedia_content', 'custom_case_mapping',
                        'alphabet', 'alphabet_mapping', 'unicode_ranges'):
      if not hasattr(lang, optional_attr):
          setattr(lang, optional_attr, None)
  # `case_mapping` is normalised to a real boolean, False when absent.
  lang.case_mapping = bool(getattr(lang, 'case_mapping', False))
  # Without explicit frequent ranges, fall back to the full unicode ranges
  # (which may themselves be None).
  if getattr(lang, 'frequent_ranges', None) is None:
      lang.frequent_ranges = lang.unicode_ranges

  def local_lowercase(text, lang):
      """Lowercase `text` following the case rules of the language `lang`.

      Each character is handled on its own:
      - a character found in `lang.custom_case_mapping` is replaced by the
        mapped value, whatever `lang.case_mapping` says;
      - otherwise an uppercase character is lowercased when
        `lang.case_mapping` is true and its lowercase form is a single
        character after NFC normalization;
      - any other character is kept unchanged.

      Returns the converted string.
      """
      custom = lang.custom_case_mapping

      def fold(letter):
          if custom is not None and letter in custom:
              return custom[letter]
          if not letter.isupper() or not lang.case_mapping:
              return letter
          lowered = letter.lower()
          # Keep the original when there is no composed lowercase form.
          if len(unicodedata.normalize('NFC', lowered)) == 1:
              return lowered
          return letter

      return ''.join(fold(letter) for letter in text)

  # Languages using ASCII letters get A-Z and a-z added to their alphabet
  # (or get exactly these letters when no alphabet was given).
  if lang.use_ascii:
      ascii_letters = [chr(cp) for cp in range(65, 91)] + \
                      [chr(cp) for cp in range(97, 123)]
      if lang.alphabet is None:
          lang.alphabet = ascii_letters
      else:
          # The alphabet may be written as a string rather than a list.
          lang.alphabet = list(lang.alphabet) + ascii_letters

  if lang.alphabet is not None:
      # The alphabet may be written as a string rather than a list.
      letters = list(lang.alphabet)
      if lang.case_mapping or lang.custom_case_mapping is not None:
          letters = [local_lowercase(letter, lang) for letter in letters]
      # Drop duplicates (lowercasing typically produces some).
      lang.alphabet = list(set(letters))

  if lang.alphabet_mapping is not None:
      # Language files map a main character to its equivalents because that
      # is easier to write; the mapping is reversed here (equivalent -> main
      # character) because that is the direction lookups use.
      fold_case = lang.case_mapping or lang.custom_case_mapping is not None
      reversed_mapping = {}
      for main_char, equivalents in lang.alphabet_mapping.items():
          target = local_lowercase(main_char, lang) if fold_case else main_char
          # The equivalents may be a string or a list of characters.
          for alt_char in equivalents:
              reversed_mapping[alt_char] = target
      lang.alphabet_mapping = reversed_mapping

  def normalize_codepoint_ranges(input_range):
      """Convert a list of ranges into (start, end) pairs of int code points.

      Args:
          input_range: iterable of (start, end) pairs, or None.  Each bound
              is either an int code point or a single-character string.

      Returns:
          The list of valid (start, end) int pairs in their original order,
          or None when `input_range` is None or holds no valid pair.

      Invalid pairs (a bound that is neither an int nor a single character,
      or start > end) are reported on stderr and skipped.
      """
      if input_range is None:
          return None

      output_range = []
      for start, end in input_range:
          # Allow to write down characters rather than unicode values.
          if isinstance(start, str) and len(start) == 1:
              start = ord(start)
          if isinstance(end, str) and len(end) == 1:
              end = ord(end)
          if not isinstance(start, int) or not isinstance(end, int):
              sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
              # Skip: comparing non-int bounds below could raise TypeError.
              continue
          if start > end:
              sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
              continue
          output_range.append((start, end))
      return output_range or None

  # Convert the configured ranges to (start, end) pairs of int code points
  # (None when nothing usable was configured).
  lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
  lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)

  # Starting processing.
  # Select the Wikipedia edition to query for this language.
  wikipedia.set_lang(lang.wikipedia_code)

  # Titles already handled for this language; reset for each language and
  # extended by visit_pages().
  visited_pages = []

  # The full list of letter characters.
  # The key is the unicode codepoint,
  # and the value is the occurrence count.
  characters = {}
  # Sequence of letters.
  # The key is the couple (char1, char2) in unicode codepoint,
  # the value is the occurrence count.
  sequences = {}
  # Code point of the previous letter seen by process_text(), or None when
  # the previous character was not a letter (or nothing was read yet).
  prev_char = None

  def process_text(content, lang):
      """Update the global letter and letter-pair counters from one text.

      The text first goes through the optional `lang.clean_wikipedia_content`
      hook, has its Wikipedia title markup removed and its whitespace runs
      collapsed to single spaces, and is lowercased when the language uses
      case mapping.  It is then scanned character by character:

      - a character is a letter when it is already a key of `characters`;
        otherwise, when `lang.unicode_ranges` is set, when it lies in one
        of these ranges; otherwise when at least one charset of
        `lang_charsets` encodes it to a byte classified as LET;
      - each letter increments its entry in `characters`;
      - each pair of consecutive letters increments its entry in
        `sequences`; a non-letter breaks the pair.

      `prev_char` is global and is not reset on entry, so the state carries
      over from one call to the next.

      Args:
          content: the text to analyse.
          lang: the language module (with its optional attributes filled).
      """
      global lang_charsets
      global characters
      global sequences
      global prev_char

      if lang.clean_wikipedia_content is not None:
          content = lang.clean_wikipedia_content(content)
      # Clean out the Wikipedia syntax for titles.
      content = re.sub(r'(=+) *([^=]+) *\1',
                       r'\2', content)
      # Clean multiple spaces. Newlines and such are normalized to spaces,
      # since they have basically a similar role in the purpose of uchardet.
      content = re.sub(r'\s+', ' ', content)

      if lang.case_mapping or lang.custom_case_mapping is not None:
          content = local_lowercase(content, lang)

      # Iterating over a Python 3 str yields one Unicode character at a time.
      for char in content:
          # Map to main equivalent character.
          if lang.alphabet_mapping is not None and \
             char in lang.alphabet_mapping:
              char = lang.alphabet_mapping[char]

          unicode_value = ord(char)
          is_letter = False
          if unicode_value in characters:
              characters[unicode_value] += 1
              is_letter = True
          elif lang.unicode_ranges is not None:
              if any(start <= unicode_value <= end
                     for start, end in lang.unicode_ranges):
                  characters[unicode_value] = 1
                  is_letter = True
          else:
              # We save the character if it is at least in one of the
              # language encodings and its not a special character.
              for charset in lang_charsets:
                  # Does the character exist in the charset?
                  try:
                      codepoint = char.encode(charset, 'ignore')
                  except LookupError:
                      # unknown encoding. Use iconv from command line instead.
                      try:
                          call = subprocess.Popen(['iconv', '-f', 'UTF-8', '-t', charset],
                                                  stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                                  stderr=subprocess.DEVNULL)
                          if call.poll() is not None:
                              # The pipes are binary, hence the bytes input.
                              (_, error) = call.communicate(input=b'')
                              sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
                              sys.exit(1)
                          (codepoint, _) = call.communicate(input=char.encode('UTF-8'))
                      except FileNotFoundError:
                          sys.stderr.write('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n'.format(charset))
                          sys.exit(1)

                  # Empty output: the character does not exist in this charset.
                  if codepoint == b'':
                      continue
                  # ord() on a bytes object of length one returns the value
                  # of that byte, i.e. the code point within the charset.
                  codepoint = ord(codepoint)
                  if lang_charsets[charset].charmap[codepoint] == LET:
                      characters[unicode_value] = 1
                      is_letter = True
                      break
          if is_letter:
              if prev_char is not None:
                  pair = (prev_char, unicode_value)
                  sequences[pair] = sequences.get(pair, 0) + 1
              prev_char = unicode_value
          else:
              prev_char = None

  def visit_pages(titles, depth, lang, logfd):
      """Crawl Wikipedia pages level by level, feeding them to process_text().

      Every not-yet-visited title of `titles` is fetched, logged and
      analysed; a random selection of the links of these pages is then
      visited recursively until `options.max_depth` is reached.

      Args:
          titles: list of page titles to visit at this level.
          depth: depth of this level (0 for the start pages).
          lang: the language module, passed through to process_text().
          logfd: writable text file receiving one line per parsed page.

      The crawl stops once `options.max_page` titles were visited (when
      that option is set).  Pages raising PageError or DisambiguationError
      are reported on stderr and skipped.
      """
      global visited_pages
      global options

      if len(titles) == 0:
          return

      next_titles = []
      if options.max_page is not None:
        # Number of links kept per page.  The divisor is clamped to 1 so
        # that `--max-depth 0` does not divide by zero.
        max_titles = int(options.max_page / max(1, options.max_depth * options.max_depth))
      else:
        max_titles = sys.maxsize
      for title in titles:
          # `>=` so that no more than `max_page` titles are ever visited.
          if options.max_page is not None and \
             len(visited_pages) >= options.max_page:
              return
          if title in visited_pages:
              continue

          # Ugly hack skipping internal pages
          if 'wiki' in title or 'Wiki' in title:
              # write() takes a single string: format the message first.
              sys.stderr.write('Skipping {}\n'.format(title))
              continue

          visited_pages += [title]
          try:
              page = wikipedia.page(title, auto_suggest=False)
          except (wikipedia.exceptions.PageError,
                  wikipedia.exceptions.DisambiguationError) as error:
              # Let's just discard a page when I get an exception.
              sys.stderr.write("Discarding page {}: {}\n".format(title, error))
              continue
          logfd.write("\n{} (revision {})".format(title, page.revision_id))
          logfd.flush()

          process_text(page.content, lang)
          try:
            links = page.links
            random.shuffle(links)
            if len(links) > max_titles:
                links = links[:max_titles]
            # Queue the links whether or not they were truncated; otherwise
            # nothing is ever followed when no page limit is set.
            next_titles += links
          except KeyError:
              # Accessing the links may raise KeyError; the page then simply
              # contributes no link.
              pass

      if depth >= options.max_depth:
          return

      random.shuffle(next_titles)
      visit_pages (next_titles, depth + 1, lang, logfd)

  # Identifier derived from the language name ('-' replaced by '_', then
  # title-cased); it is embedded in the log and C++ file names and symbols.
  language_c = lang.name.replace('-', '_').title()
  build_log = current_dir + '/BuildLangModelLogs/Lang{}Model.log'.format(language_c)
  # The log stays open past this section: later steps keep writing to it.
  logfd = open(build_log, 'w')
  logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
  logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
  logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
  logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
  if options.max_page is not None:
      logfd.write('\n- Max number of pages: {}'.format(options.max_page))
  logfd.write('\n\n== Parsed pages ==\n')
  logfd.flush()
  # Crawl Wikipedia from the start pages (depth 0); this fills the global
  # `characters` and `sequences` counters through process_text().
  try:
      visit_pages(lang.start_pages, 0, lang, logfd)
  except requests.exceptions.ConnectionError:
      sys.stderr.write('Error: connection to Wikipedia failed. Aborting\n')
      exit(1)
  logfd.write('\n\n== End of Parsed pages ==')
  logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
  logfd.flush()

  ########### CHARACTERS ###########

  # Character ratios.
  # `ratios` maps a code point to its share of all counted letter
  # occurrences.
  ratios = {}
  n_char = len(characters)
  occurrences = sum(characters.values())

  logfd.write("\n{} characters appeared {} times.\n".format(n_char, occurrences))
  for char in characters:
      ratios[char] = characters[char] / occurrences
      #logfd.write("Character '{}' usage: {} ({} %)\n".format(chr(char),
      #                                                       characters[char],
      #                                                       ratios[char] * 100))

  # List of (code point, ratio) pairs, most frequent first.  The index of a
  # character in this list is its "order".
  sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1),
                         reverse=True)
  # Accumulated ratios of the frequent chars.
  accumulated_ratios = 0

  # If there is no alphabet defined, we just use the first 64 letters, which was
  # the original default.
  # If there is an alphabet, we make sure all the alphabet characters are in the
  # frequent list, and we stop then. There may therefore be more or less than
  # 64 frequent characters depending on the language.
  logfd.write('\nMost Frequent characters:')
  # `very_freq_count` counts the leading characters taken while their
  # accumulated ratio `very_freq_ratio` is still below 0.4 (the character
  # that crosses the threshold is included).
  very_freq_count = 0
  very_freq_ratio = 0
  if lang.alphabet is None and lang.frequent_ranges is None:
      # No hint from the language file: keep at most the 64 most frequent
      # characters.
      freq_count = min(64, len(sorted_ratios))
      for order, (char, ratio) in enumerate(sorted_ratios):
          if order >= freq_count:
              break
          logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
          accumulated_ratios += ratio
          if very_freq_ratio < 0.4:
            very_freq_count += 1
            very_freq_ratio += ratio
  elif lang.alphabet is not None:
      # Walk down the frequency list, ticking alphabet letters off as they
      # are met, and stop once the whole alphabet was seen.  Every character
      # met on the way counts as frequent, in the alphabet or not.
      # Note: `lang.alphabet` is consumed (emptied) by this loop.
      freq_count = 0
      for order, (char, ratio) in enumerate(sorted_ratios):
          if len(lang.alphabet) == 0:
              break
          if chr(char) in lang.alphabet:
              lang.alphabet.remove(chr(char))
          logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
          accumulated_ratios += ratio
          freq_count += 1
          if very_freq_ratio < 0.4:
            very_freq_count += 1
            very_freq_ratio += ratio
      else:
          # for/else: reached only when the loop ran to the end without
          # `break`, i.e. the data ran out.  Letters still left in the
          # alphabet were never seen in the collected text.
          if len(lang.alphabet) > 0:
              sys.stderr.write("Error: alphabet characters are absent from data collection"
                               "\n       Please check the configuration or the data."
                               "\n       Missing characters: {}".format(", ".join(lang.alphabet)))
              exit(1)
  elif lang.frequent_ranges is not None:
      # How many characters in the frequent range?
      frequent_ranges_size = 0
      for start, end in lang.frequent_ranges:
        frequent_ranges_size += end - start + 1

      # Keep ratio for at least all the characters inside the frequent
      # ranges.
      # The walk stops once as many in-range characters were met as the
      # ranges contain; it otherwise runs to the end of the data.
      freq_count = 0
      for order, (char, ratio) in enumerate(sorted_ratios):
        for start, end in lang.frequent_ranges:
          if char >= start and char <= end:
            freq_count += 1
            accumulated_ratios += ratio
            logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
            frequent_ranges_size -= 1
            break
        else:
          # A frequent character in the non-frequent range.
          # It is counted as frequent too, but does not reduce the number
          # of in-range characters still expected.
          logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
          freq_count += 1
          accumulated_ratios += ratio

        if very_freq_ratio < 0.4:
          very_freq_count += 1
          very_freq_ratio += ratio

        if frequent_ranges_size <= 0:
          break

  # Walk the frequent characters from the least frequent one upwards,
  # accumulating their ratios until the sum reaches 0.03.
  # `low_freq_order` is lowered by one for each character accumulated.
  low_freq_order = freq_count - 1
  low_freq_ratio = 0
  for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
    if low_freq_ratio < 0.03:
      low_freq_ratio += ratio
      low_freq_order -= 1
    else:
      break

  logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
  logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
  logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))

  # `c_code` accumulates the whole text of the generated C++ file.  It
  # starts with the content of the shared header template.
  with open(current_dir + '/header-template.cpp', 'r') as header_fd:
      c_code = header_fd.read()

  # Includes needed by the generated model, followed by a banner recording
  # the language, the generating script and the generation time.
  c_code += '\n#include "../nsSBCharSetProber.h"'
  c_code += '\n#include "../nsSBCharSetProber-generated.h"'
  c_code += '\n#include "../nsLanguageDetector.h"\n'
  c_code += '\n#include "../nsLanguageDetector-generated.h"\n'
  c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
  c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
  c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
  c_code += ' **/\n'

  # C comment documenting the character-to-order tables that follow in the
  # generated file.  The literal is emitted as is, leading spaces included.
  c_code += \
  """
  /* Character Mapping Table:
   * ILL: illegal character.
   * CTR: control character specific to the charset.
   * RET: carriage/return.
   * SYM: symbol (punctuation) that does not belong to word.
   * NUM: 0 - 9.
   *
   * Other characters are ordered by probabilities
   * (0 is the most common character in the language).
   *
   * Orders are generic to a language. So the codepoint with order X in
   * CHARSET1 maps to the same character as the codepoint with the same
   * order X in CHARSET2 for the same language.
   * As such, it is possible to get missing order. For instance the
   * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
   * even though they are both used for French. Same for the euro sign.
   */
  """

  # Order (0 = most frequent) of every character seen in the collected
  # text.  Built once so that each table cell is a dictionary lookup rather
  # than a scan of `sorted_ratios`.
  char_orders = {char: order for order, (char, _) in enumerate(sorted_ratios)}

  # Emit one 256-entry table per charset, as 16 rows of 16 cells.  A cell
  # holds either a character-class constant or the order of the letter.
  for charset in lang_charsets:
      charset_c = charset.replace('-', '_').title()
      CTOM_str = 'static const unsigned char {}_CharToOrderMap[]'.format(charset_c)
      CTOM_str += ' =\n{'
      for line in range(0, 16):
          CTOM_str += '\n  '
          for column in range(0, 16):
              cp = line * 16 + column
              cp_type = lang_charsets[charset].charmap[cp]
              if cp_type == ILL:
                  CTOM_str += 'ILL,'
              elif cp_type == RET:
                  CTOM_str += 'RET,'
              elif cp_type == CTR:
                  CTOM_str += 'CTR,'
              elif cp_type == SYM:
                  CTOM_str += 'SYM,'
              elif cp_type == NUM:
                  CTOM_str += 'NUM,'
              else: # LET
                  # Find which Unicode character this byte stands for.
                  try:
                      uchar = bytes([cp]).decode(charset)
                  except UnicodeDecodeError:
                      sys.stderr.write('Unknown character 0X{:X} in {}. '.format(cp, charset))
                      sys.stderr.write('Please verify your charset specification.\n')
                      sys.exit(1)
                  except LookupError:
                      # Unknown encoding. Use iconv instead.
                      try:
                          call = subprocess.Popen(['iconv', '-t', 'UTF-8', '-f', charset],
                                                  stdin=subprocess.PIPE,
                                                  stdout=subprocess.PIPE,
                                                  stderr=subprocess.PIPE)
                          if call.poll() is not None:
                              # The pipes are binary, hence the bytes input.
                              (_, error) = call.communicate(input=b'')
                              sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
                              sys.exit(1)
                          (uchar, _) = call.communicate(input=bytes([cp]))
                          uchar = uchar.decode('UTF-8')
                      except FileNotFoundError:
                          sys.stderr.write('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n'.format(charset))
                          sys.exit(1)
                      if len(uchar) == 0:
                          sys.stderr.write('TypeError: iconv failed to return a unicode character for codepoint "{}" in charset {}.\n'.format(hex(cp), charset))
                          sys.exit(1)
                  # Apply the same normalisation as the collected text went
                  # through, so that the character can be found in the data.
                  if lang.case_mapping or lang.custom_case_mapping is not None:
                      uchar = local_lowercase(uchar, lang)
                  if lang.alphabet_mapping is not None and uchar in lang.alphabet_mapping:
                      uchar = lang.alphabet_mapping[uchar]
                  known_order = char_orders.get(ord(uchar))
                  if known_order is not None:
                      CTOM_str += '{:3},'.format(min(249, known_order))
                  else:
                      # The letter never appeared in the collected text: give
                      # it the next free order after the known characters.
                      # XXX: we must make sure the character order does not go
                      # over the special characters (250 currently). This may
                      # actually happen when building a model for a language
                      # writable with many different encoding. So let's just
                      # ceil the order value at 249 max.
                      # It may be an interesting alternative to add another
                      # constant for any character with an order > freqCharCount.
                      # Maybe IRR (irrelevant character) or simply CHR.
                      # NOTE(review): `n_char` keeps growing across charsets,
                      # so the same unseen letter gets a different order in
                      # each table - confirm this is acceptable.
                      CTOM_str += '{:3},'.format(min(249, n_char))
                      n_char += 1
          CTOM_str += ' /* {:X}X */'.format(line)
      CTOM_str += '\n};\n/*'
      CTOM_str += 'X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF'
      CTOM_str += ' */\n\n'
      c_code += CTOM_str

  ## UNICODE frequency.

  # Since we can't map the full character table from encoding to order,
  # just create a list from the most common characters from the language.
  # The list is ordered by unicode code points (hence can be used
  # generically for various encoding scheme as it is not encoding
  # specific) allowing to search from code points efficiently by a divide
  # and conquer search algorithm.
  # Each code point is immediately followed by its order.

  # Keep the freq_count more frequent characters.
  sorted_chars = [(char, freq, order) for order, (char, freq) in
                  enumerate(sorted_ratios)][:freq_count]
  max_order = len(sorted_chars)

  if max_order == 0:
      # Without any frequent character the table is empty and the column
      # widths below cannot be computed: abort with an explicit message.
      sys.stderr.write("Error: no frequent character was collected for '{}'.\n"
                       "       Please check the configuration or the data.\n".format(lang.code))
      sys.exit(1)

  # Add equivalency characters.
  # An equivalent shares the ratio and the order of its base character.
  equivalent = []
  if lang.case_mapping:
      for char, ratio, order in sorted_chars:
          uppercased = chr(char).upper()
          try:
            if char != ord(uppercased):
                equivalent += [(ord(uppercased), ratio, order)]
          except TypeError:
            # This happens for some case such as 'SS' as uppercase of 'ß'.
            # Just ignore such cases.
            sys.stderr.write("Ignoring '{}' as uppercase equivalent of '{}'.\n".format(uppercased, chr(char)))

  if lang.alphabet_mapping is not None:
    for alt_c in lang.alphabet_mapping:
      for char, ratio, order in sorted_chars:
        if alt_c == chr(char):
          # The alternative character is itself among the frequent
          # characters: treated as a fatal inconsistency.
          sys.stderr.write("ALREADY {}\n".format(alt_c))
          sys.exit(1)
        elif char == ord(lang.alphabet_mapping[alt_c]):
          equivalent += [(ord(alt_c), ratio, order)]
          break
      else:
        # for/else: no frequent character is the base of this alternative.
        sys.stderr.write("Base equivalent for {} not found in frequent characters!\n".format(alt_c))
        sys.exit(1)

  sorted_chars += equivalent

  # Order by code point.
  sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))

  CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))

  CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
  CTOM_str += ' =\n{'
  column = 0

  # Number of decimal digits of the largest code point and of the number of
  # frequent characters, used to right-align the columns.
  max_char_width  = math.floor(math.log10(sorted_chars[-1][0])) + 1
  max_order_width = math.floor(math.log10(max_order)) + 1

  # Eight (code point, order) couples per line.
  for char, ratio, order in sorted_chars:
      if column % 8 == 0:
          CTOM_str += '\n '
      column += 1
      CTOM_str += '{}{:>{width}}, '.format('' if column % 8 == 0 else ' ', char, width=max_char_width)
      CTOM_str += '{:>{width}},'.format(order, width=max_order_width)

  CTOM_str += '\n};\n\n'
  c_code += CTOM_str

  ########### SEQUENCES ###########

  ratios = {}
  # Total number of letter pairs counted in the collected text.
  occurrences = sum(sequences.values())

  # Walk the sequences from the most to the least frequent and record:
  # - order_3 / ratio_3: rank and accumulated ratio at the first sequence
  #   where the accumulated ratio reaches 0.995;
  # - order_2 / ratio_2: same for the 0.999 threshold.
  # -1 means "not reached yet".
  accumulated_seq_count = 0
  order_3 = -1
  order_2 = -1
  ratio_3 = -1
  ratio_2 = -1
  # Counts of the first 512 and of the next 512 sequences, only used by the
  # fallback below.
  # NOTE(review): both start at -1 rather than 0, so each total is one less
  # than the real sum - looks unintended, confirm.
  count_512 = -1
  count_1024 = -1
  # List of ((char1, char2), count) pairs, most frequent first.
  sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
                       reverse=True)
  for order, ((c1, c2), count) in enumerate(sorted_seqs):
    accumulated_seq_count += count
    # Because of the `elif`, the two thresholds are never recorded during
    # the same iteration: order_2 is set one sequence later at the earliest.
    if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
      order_3 = order
      ratio_3 = accumulated_seq_count / occurrences
    elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
      order_2 = order
      ratio_2 = accumulated_seq_count / occurrences
    if order < 512:
      count_512 += count
    elif order < 1024:
      count_1024 += count

    if order_3 != -1 and order_2 != -1:
      break

  if order_3 == -1 or order_2 == -1:
    # This would probably never happen. It would require a language with
    # very few possible sequences and each of the sequences are widely
    # used. Just add this code for completion, but it won't likely ever be
    # run.
    # NOTE(review): here order_2 (512) is lower than order_3 (1024) while
    # the loop above always yields order_3 < order_2; the differences
    # printed below then become negative - confirm the intended values.
    # NOTE(review): divides by `occurrences`, which is 0 when no sequence
    # was collected.
    order_2 = 512
    order_3 = 1024
    ratio_2 = count_512 / occurrences
    ratio_3 = count_1024 / occurrences

  logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))

  # Summary comment placed in the generated file before the model table.
  c_code += """
  /* Model Table:
   * Total considered sequences: {} / {}
   * - Positive sequences: first {} ({})
   * - Probable sequences: next {} ({}-{}) ({})
   * - Neutral sequences: last {} ({})
   * - Negative sequences: {} (off-ratio)
   * Negative sequences: TODO""".format(len(sorted_seqs),
                                        freq_count * freq_count,
                                        order_3, ratio_3,
                                        order_2 - order_3,
                                        order_2, order_3,
                                        ratio_2 - ratio_3,
                                        freq_count * freq_count - order_2,
                                        1 - ratio_2,
                                        freq_count * freq_count - len(sorted_seqs))

  logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
  logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
                                             order_2, order_3,
                                             ratio_2 - ratio_3))
  logfd.write("\nRest: {}".format(1 - ratio_2))

  # Close the C comment opened by the summary above.
  c_code += "\n */\n"

  # Rank of every observed sequence (0 = most frequent).  Built once so that
  # each cell of the matrix is a dictionary lookup instead of a scan of
  # `sorted_seqs`.
  seq_orders = {seq: order for order, (seq, _) in enumerate(sorted_seqs)}
  n_ranked = len(sorted_ratios)

  # freq_count x freq_count matrix: the row is the order of the first
  # character of a pair, the column the order of the second one.  A cell is
  # 3 for the sequences ranked before order_3, 2 for those ranked before
  # order_2, 1 for the other observed sequences and 0 for pairs never seen.
  LM_str = 'static const PRUint8 {}LangModel[]'.format(language_c)
  LM_str += ' =\n{'
  for line in range(0, freq_count):
      LM_str += '\n  '
      for column in range(0, freq_count):
          # Let's not make too long lines.
          if freq_count > 40 and column == int(freq_count / 2):
              LM_str += '\n   '
          # It may indeed happen that we find less than 64 letters used for
          # a given language: such cells stay at 0.
          category = '0,'
          if line < n_ranked and column < n_ranked:
              seq = (sorted_ratios[line][0], sorted_ratios[column][0])
              seq_order = seq_orders.get(seq)
              if seq_order is not None:
                  if seq_order < order_3:
                      category = '3,'
                  elif seq_order < order_2:
                      category = '2,'
                  else:
                      category = '1,'
          LM_str += category
  LM_str += '\n};\n'
  c_code += LM_str

  # Append one `SequenceModel` definition to c_code for each entry of
  # lang_charsets.
  for charset in lang_charsets:
      # C identifier form of the charset name: dashes become underscores
      # and each word is capitalized.
      charset_ident = charset.replace('-', '_').title()
      ascii_flag = 'PR_TRUE' if lang.use_ascii else 'PR_FALSE'
      c_code += ''.join([
          '\n\nconst SequenceModel {}{}Model ='.format(charset_ident, language_c),
          '\n{\n  ',
          '{}_CharToOrderMap,\n  {}LangModel,'.format(charset_ident, language_c),
          '\n  {},'.format(freq_count),
          '\n  (float){},'.format(ratio_2),
          '\n  {},'.format(ascii_flag),
          '\n  "{}",'.format(charset),
          '\n  "{}"'.format(lang.code),
          '\n};',
      ])

  # Append the `LanguageModel` definition for this language to c_code.
  # NOTE(review): the original author flagged the len(sorted_chars) field
  # with "Order is wrong!"; what exactly is wrong cannot be told from here.
  language_model_parts = [
      '\n\nconst LanguageModel {}Model ='.format(language_c),
      '\n{',
      '\n  "{}",'.format(lang.code),
      '\n  Unicode_CharOrder,',
      '\n  {},'.format(len(sorted_chars)),
      '\n  {}LangModel,'.format(language_c),
      '\n  {},'.format(freq_count),
      '\n  {},'.format(very_freq_count),
      '\n  (float){},'.format(very_freq_ratio),
      '\n  {},'.format(low_freq_order),
      '\n  (float){},'.format(low_freq_ratio),
      '\n};',
  ]
  c_code += ''.join(language_model_parts)

  # End the generated source with a newline and write it to the language
  # model file under src/LangModels/.
  c_code += '\n'

  lang_model_name = 'Lang{}Model.cpp'.format(language_c)
  lang_model_file = current_dir + '/../src/LangModels/' + lang_model_name
  with open(lang_model_file, 'w') as model_fd:
      model_fd.write(c_code)

  # Stamp the end of processing in the build log, then close it.
  processing_end = str(datetime.datetime.now())
  logfd.write('\n\n- Processing end: {}\n'.format(processing_end))
  logfd.close()

  # Remember what was produced so it can be reported at the end of the run.
  generated_files.append((lang_model_file, build_log))

charset_cpp = os.path.join(current_dir, '../src', 'nsSBCharSetProber-generated.h')
print("\nGenerating {}…".format(charset_cpp))

# Generate the header declaring every sequence model.  It contains the
# shared header template, an include guard, the NUM_OF_SEQUENCE_MODELS count
# and one `extern const SequenceModel` declaration per charset of each
# language in all_langs.
with open(charset_cpp, 'w') as cpp_fd:
  with open(current_dir + '/header-template.cpp', 'r') as header_fd:
    cpp_fd.write(header_fd.read())

  cpp_fd.write('\n#ifndef nsSingleByteCharSetProber_generated_h__')
  cpp_fd.write('\n#define nsSingleByteCharSetProber_generated_h__\n')

  extern_declarations = []
  n_sequence_models = 0
  for l in all_langs:
    l = l.lower()
    # Load the language data, resolving the module from langs/ only.
    sys_path_backup = sys.path
    sys.path = [current_dir + '/langs']
    try:
        lang = importlib.import_module(l)
    except ImportError:
        sys.stderr.write('Unknown language code "{}": '
                         'file "langs/{}.py" does not exist.'.format(l, l))
        # sys.exit() rather than the `site` builtin exit(), which is meant
        # for the interactive shell and is absent under `python -S`.
        sys.exit(1)
    finally:
        # Restore the module search path on every path, including errors.
        sys.path = sys_path_backup

    language_c = lang.name.replace('-', '_').title()
    lang_charsets = charsets.db.load(lang.charsets)
    for charset in lang_charsets:
      charset_c = charset.replace('-', '_').title()
      extern_declarations.append(
          '\nextern const SequenceModel {}{}Model;'.format(charset_c, language_c))
      n_sequence_models += 1
    # Blank line between the declarations of two languages.
    extern_declarations.append('\n')
  all_extern_declarations = ''.join(extern_declarations)

  cpp_fd.write('\n#define NUM_OF_SEQUENCE_MODELS {}\n'.format(n_sequence_models))
  cpp_fd.write(all_extern_declarations)
  cpp_fd.write('\n#endif /* nsSingleByteCharSetProber_generated_h__ */')

print("Done!")

language_cpp = os.path.join(current_dir, '../src', 'nsLanguageDetector-generated.h')
print("\nGenerating {}…".format(language_cpp))

# Generate the header declaring every language model.  It contains the
# shared header template, an include guard, the NUM_OF_LANGUAGE_MODELS count
# and one `extern const LanguageModel` declaration per language in
# all_langs.
with open(language_cpp, 'w') as cpp_fd:
  with open(current_dir + '/header-template.cpp', 'r') as header_fd:
    cpp_fd.write(header_fd.read())

  cpp_fd.write('\n#ifndef nsLanguageDetector_h_generated_h__')
  cpp_fd.write('\n#define nsLanguageDetector_h_generated_h__\n')

  extern_declarations = []
  n_language_models = 0
  for l in all_langs:
    l = l.lower()
    # Load the language data, resolving the module from langs/ only.
    sys_path_backup = sys.path
    sys.path = [current_dir + '/langs']
    try:
        lang = importlib.import_module(l)
    except ImportError:
        sys.stderr.write('Unknown language code "{}": '
                         'file "langs/{}.py" does not exist.'.format(l, l))
        # sys.exit() rather than the `site` builtin exit(), which is meant
        # for the interactive shell and is absent under `python -S`.
        sys.exit(1)
    finally:
        # Restore the module search path on every path, including errors.
        sys.path = sys_path_backup

    language_c = lang.name.replace('-', '_').title()
    extern_declarations.append(
        '\nextern const LanguageModel {}Model;'.format(language_c))
    n_language_models += 1
  all_extern_declarations = ''.join(extern_declarations)

  cpp_fd.write('\n#define NUM_OF_LANGUAGE_MODELS {}\n'.format(n_language_models))
  cpp_fd.write(all_extern_declarations)
  cpp_fd.write('\n\n#endif /* nsLanguageDetector_h_generated_h__ */')

print("Done!")
# Report each generated language file together with its build log.
if generated_files:
  print("\nThe following language files has been generated:")
  for generated in generated_files:
    (model_path, log_path) = generated
    print("\n- Language file: {}".format(model_path))
    print("\n  Build log: {}".format(log_path))

# Remind the user of the manual steps that remain after generation.
todo_lines = (
  "\nTODO:",
  "- edit nsSBCSGroupProber::nsSBCSGroupProber() in src/nsSBCSGroupProber.cpp manually to test new sequence models;",
  "- edit nsMBCSGroupProber::nsMBCSGroupProber() in src/nsMBCSGroupProber.cpp manually to test new language models;",
  "- add any new language files to src/CMakeLists.txt;",
  "- commit generated files if tests are successful.",
)
for todo_line in todo_lines:
  print(todo_line)