#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Jehan
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####

# Language description for Slovene: a set of plain module-level constants
# (no functions or classes) meant to be read by whatever imports this module.

import re

## Mandatory Properties ##

# Human-readable language name and its short language code.
name = 'Slovene'
code = 'sl'
# ASCII is used except q and w, x and y according to Wikipedia.
use_ascii = False
# Character sets associated with this language.
charsets = [
    'ISO-8859-2',
    'ISO-8859-16',
    'Windows-1250',
    'IBM852',
    'MAC-CENTRALEUROPE',
]

## Optional Properties ##

# Alphabet characters.
# XXX According to Wikipedia there are 2 incompatible diacritics
# systems, but both seem barely used on Wikipedia (even though I can see
# some usage here or there, on some titles), so I assume these don't
# exist. Maybe this would have to be separate models for variants of
# the language.
alphabet = 'abcčdefghijklmnoprsštuvzž'
#alphabet = 'abcčdefghijklmnoprsštuvzž' + 'áȃȁéêèẹ́ȇẹ̑ȅə̀ə̏íìȋȉóôòóọ́ȏọ̑ȍúùȗȕŕȓ'

# Equivalent letters. This is because Slovene uses some diacritics but
# they are so rarely used (in titles mostly for Wikipedia) that counting
# them in the stats would be counter-productive. Moreover they are not
# letters of their own, but really replace the non-diacritical letter to
# help with disambiguation. For instance "gol" both means "naked" and
# "goal" and could sometimes be written "gòl" for the former meaning or
# "gól" for the latter.
#
# Each key is a base letter of `alphabet`; each value is the run of
# accented variants folded into it. The values list exactly the characters
# of the commented-out extended alphabet above, so they must not contain
# any whitespace (a stray space would make the space character an
# "equivalent" of the base letter).
alphabet_mapping = {
    'a': 'áȃȁ',
    'e': 'éêèẹ́ȇẹ̑ȅə̀ə̏',
    'i': 'íìȋȉ',
    'o': 'óôòóọ́ȏọ̑ȍ',
    'u': 'úùȗȕ',
    'r': 'ŕȓ',
}

# Presumably the page title(s) a Wikipedia crawl starts from -- confirm
# against the consumer of this module.
start_pages = ['Ljubljana']
# The Wikipedia code is the same as the language code.
wikipedia_code = code
# NOTE(review): presumably tells the consumer that upper/lower case letters
# map onto each other for this language -- confirm against the consumer.
case_mapping = True