#!/bin/python3
# -*- coding: utf-8 -*-

# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Jehan
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####

# Language description for Hindi: a flat set of module-level properties.
# NOTE(review): the script that consumes these properties is not part of this
# file; the per-property notes below describe only what the values themselves
# show and should be confirmed against that consumer.

import re

## Mandatory Properties ##

# Human-readable language name.
name = 'Hindi'
# Alternative names for this entry (here: the name of the script).
aliases = ['Devanagari']
# Short language code; reused below as the Wikipedia code.
code = 'hi'
# NOTE(review): presumably tells the consumer not to count ASCII letters as
# part of this language's alphabet -- confirm against the consumer.
use_ascii = False

# ISCII is a collection of single byte encodings (10 variants).
# Unfortunately it looks like neither iconv nor python (probably based off
# iconv?) knows any of the ISCII encodings. Therefore I cannot build any ISCII
# data, and as a consequence cannot process it.
# See:
# http://stackoverflow.com/questions/27143365/unicode-to-iscii-conversion
# https://en.wikipedia.org/wiki/Indian_Script_Code_for_Information_Interchange
# Anyway according to Wikipedia, these encodings don't seem much used. UTF-8 is
# mostly used in India, it would appear.
#charsets = ['ISCII']
# Deliberately empty: no legacy charset is listed, for the reason given above.
charsets = []

## Optional Properties ##

# Devanagari script, see:
# https://en.wikipedia.org/wiki/Devanagari
# 11 vowels and 33 consonants with independent and diacritic forms, etc.
# To keep it simple, I don't list the alphabet and let the statistics work
# their magic.
# Each entry is a (first, last) pair of code points.
# NOTE(review): bounds look inclusive (they match the Unicode block limits) --
# confirm how the consumer interprets them.
unicode_ranges = [
    (0x900, 0x97F),    # Devanagari
    (0xA8E0, 0xA8FF),  # Devanagari Extended
    (0x1CD0, 0x1CFF),  # Vedic Extensions
]

# NOTE(review): presumably the title(s) of the Wikipedia page(s) used as the
# starting point when gathering sample text -- confirm against the consumer.
start_pages = ['मुखपृष्ठ']
# The Wikipedia code is the same as the language code.
wikipedia_code = code
# NOTE(review): presumably False because the script has no upper/lower case
# distinction -- confirm against the consumer.
case_mapping = False