diff --git a/script/BuildLangModelLogs/LangGermanModel.log b/script/BuildLangModelLogs/LangGermanModel.log new file mode 100644 index 0000000..9115d29 --- /dev/null +++ b/script/BuildLangModelLogs/LangGermanModel.log @@ -0,0 +1,159 @@ += Logs of language model for German (de) = + +- Generated by BuildLangModel.py +- Started: 2015-12-03 22:42:29.154759 +- Maximum depth: 3 +- Max number of pages: 100 + +== Parsed pages == + +Wikipedia:Hauptseite (revision 140459035) +1740 (revision 145584733) +1890 (revision 148575121) +1925 (revision 148682812) +1965 (revision 148411693) +3. Dezember (revision 148684818) +Bundeswehreinsatz in Syrien (revision 148714599) +Clara Klabunde (revision 148697193) +Day Tripper (revision 145956669) +Dezember 2015 (revision 148713161) +Edwar al-Charrat (revision 148656295) +Enzyklika (revision 148704406) +Enzyklopädie (revision 148364925) +Facebook Inc. (revision 148280344) +Franz Neubauer (CSU) (revision 148710968) +Freie Inhalte (revision 148123311) +Gabriele Ferzetti (revision 148715582) +Georg von Waldburg zu Zeil und Trauchburg (revision 148710609) +Jim Loscutoff (revision 148690370) +Katarina Witt (revision 148713884) +Klavierkonzert (Gershwin) (revision 143900338) +Ludolf Camphausen (revision 145088962) +Mark Zuckerberg (revision 148714452) +Montenegro (revision 148692773) +NATO (revision 148697872) +NATO-Osterweiterung (revision 148697354) +Nekrolog 2015 (revision 148711617) +Peter-Ulrich-Haus (revision 148654149) +Philanthropie (revision 145561255) +Präsidentschaftswahl in Burkina Faso 2015 (revision 148677453) +Québec (Stadt) (revision 148716893) +Rivka Zohar (revision 148708850) +Roch Marc Kaboré (revision 148673951) +Rubber Soul (revision 148665720) +Salve Regina (Latry) (revision 148713279) +Schießerei in San Bernardino (revision 148711974) +Single (Musik) (revision 146450210) +The Giving Pledge (revision 148711856) +Ubi primum (Benedikt XIV.) 
(revision 136691297) +VTech (revision 148704025) +Walter Damrosch (revision 148716127) +We Can Work It Out (revision 148706519) +1. August (revision 148089156) +1. Januar (revision 148659041) +1. Juni (revision 148375663) +1. November (revision 147888516) +10. August (revision 148079904) +10. November (revision 148658709) +10. September (revision 148201788) +11. August (revision 148315737) +11. Oktober (revision 148087353) +12. Januar (revision 147377586) +12. September (revision 148359994) +13. Dezember (revision 148614781) +13. September (revision 148320520) +14. August (revision 148513270) +14. Dezember (revision 147968142) +15. April (revision 146544147) +15. August (revision 147827975) +16. April (revision 148712866) +16. Dezember (revision 148392316) +16. Februar (revision 148221712) +16. Jahrhundert (revision 147390194) +16. Juli (revision 147928181) +1652 (revision 142931287) +1654 (revision 145531451) +1656 (revision 144194148) +1657 (revision 147492859) +1662 (revision 147548355) +1665 (revision 147757128) +1666 (revision 147843417) +1667 (revision 148566099) +1668 (revision 145304760) +1670 (revision 147643990) +1672 (revision 145296252) +1673 (revision 147879655) +1674 (revision 146784434) +1679 (revision 146069377) +1685 (revision 148596629) +1688 (revision 140370621) +1692 (revision 146892539) +1693 (revision 147464373) +17. August (revision 148288443) +17. Februar (revision 145814425) +17. Jahrhundert (revision 147869798) +17. 
Oktober (revision 148327370) +1700er (revision 127393249) +1707 (revision 148288721) +1710er (revision 134739897) +1720er (revision 127302296) +1730 (revision 148694277) +1730er (revision 127393280) +1731 (revision 147730204) +1735 (revision 145436596) +1736 (revision 145680122) +1737 (revision 146645905) +1738 (revision 145094942) +1739 (revision 147843445) +1740er (revision 127393296) +1741 (revision 146530178) +1742 (revision 147010984) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2015-12-03 22:50:46.517106 + +59 characters appeared 1746165 times. + +First 31 characters: +[ 0] Char e: 14.27997926885489 % +[ 1] Char r: 8.696257226550754 % +[ 2] Char n: 8.464091308667852 % +[ 3] Char i: 8.258784250056554 % +[ 4] Char s: 6.690833913175444 % +[ 5] Char a: 6.370703799469123 % +[ 6] Char t: 5.925728668253001 % +[ 7] Char h: 4.540979804314025 % +[ 8] Char d: 4.367284878576767 % +[ 9] Char l: 4.083634708060234 % +[10] Char u: 3.899917819908199 % +[11] Char o: 3.6450163644329145 % +[12] Char c: 3.392405643223865 % +[13] Char m: 2.578565026787274 % +[14] Char g: 2.543631329227192 % +[15] Char b: 1.9455206123132693 % +[16] Char k: 1.7604292836014925 % +[17] Char f: 1.6422273954637734 % +[18] Char p: 1.519329502080273 % +[19] Char w: 1.0273370500496803 % +[20] Char z: 1.0037997554641171 % +[21] Char v: 0.9010603236234834 % +[22] Char ä: 0.4926224039538073 % +[23] Char j: 0.4661644231787947 % +[24] Char ü: 0.4094687500894818 % +[25] Char y: 0.34229296773214446 % +[26] Char ö: 0.3044958523392692 % +[27] Char ß: 0.14477440562604335 % +[28] Char x: 0.09918879372796958 % +[29] Char é: 0.07633871942227682 % +[30] Char q: 0.06099079983850323 % + +The first 31 characters have an accumulated ratio of 0.9993385504806246. + +1188 sequences found. 
+ +First 512 (typical positive ratio): 0.9934041448127945 +Next 512 (512-1024): 1.1453671331174316e-06 +Rest: 0.0001130256702826099 + +- Processing end: 2015-12-03 22:50:46.681265 diff --git a/script/langs/de.py b/script/langs/de.py new file mode 100644 index 0000000..554f142 --- /dev/null +++ b/script/langs/de.py @@ -0,0 +1,78 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'German' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'de' +# ASCII characters are also used in German. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-1', 'WINDOWS-1252'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = ['ä', 'ö', 'ü', 'ß'] +# The start page. Though optional, it is advised to choose one yourself. +start_pages = ['Wikipedia:Hauptseite'] +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. 
+def clean_wikipedia_content(content): + # Get rid of title syntax: "=== Siehe auch ===" + cleaned = re.sub(r'(=+) *([^=]+) *\1', + r'\2', + content) + return cleaned diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3e641a..c8fae7b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,7 @@ set( LangModels/LangBulgarianModel.cpp LangModels/LangCyrillicModel.cpp LangModels/LangFrenchModel.cpp + LangModels/LangGermanModel.cpp LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp diff --git a/src/LangModels/LangGermanModel.cpp b/src/LangModels/LangGermanModel.cpp new file mode 100644 index 0000000..7a2436b --- /dev/null +++ b/src/LangModels/LangGermanModel.cpp @@ -0,0 +1,168 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: German *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-03 22:50:46.518374 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 59,SYM,SYM,SYM,SYM,SYM,SYM, 36,SYM, 54,ILL, 42,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 36,SYM, 54,ILL, 42, 56, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* CX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 61, 24, 45, 62, 27, /* DX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* EX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 63, 24, 45, 64, 56, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 65,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* CX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 66, 24, 45, 67, 27, /* DX */ + 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* EX */ + 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 68, 24, 45, 69, 56, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1188 + * First 512 sequences: 0.9934041448127945 + * Next 512 sequences (512-1024): 0.006482829516922903 + * Rest: 0.0001130256702826099 + * Negative sequences: TODO + */ +static const PRUint8 GermanLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,3,3,2,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,0,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,0,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,2,3,2,2,3,2,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,2,2,3,2,3,3,2,0,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,3,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,0,3,3,1,2, + 3,3,2,3,2,3,3,3,2,3,3,3,3,2,2,2,3,2,2,2,2,2,2,2,1,3,2,0,1,2,3, + 3,3,2,3,3,3,3,2,3,3,3,3,3,3,2,3,2,3,3,2,2,2,3,2,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,2,3,2,3,3,2,0,2,2,1, + 3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,2,2,2,2,2,2,3,2,3,3,3,0,0,2,0, + 
3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,2,3,3,3,2,2,2,3,2,3,3,3,0,1,2,1, + 3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,2,3,3,2,2,2,2,3,2,3,2,3,0,0,2,0, + 3,3,2,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,2,3,2,2,2,2,0,0,2,0, + 3,3,3,3,3,3,2,2,2,2,3,3,1,2,2,2,2,2,2,2,2,2,3,3,3,2,3,0,0,0,0, + 3,2,2,3,3,3,3,2,2,3,3,3,2,3,2,3,2,2,2,3,3,2,2,2,3,3,3,0,0,2,2, + 3,2,2,3,2,3,2,0,2,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,0,2,3,0,0,2,1, + 2,3,3,3,3,2,3,3,3,3,3,2,3,3,3,2,2,3,2,0,2,2,0,0,0,0,0,2,0,0,2, + 3,2,2,3,2,3,2,2,2,2,3,3,2,2,2,1,2,1,2,0,2,0,3,2,3,2,2,0,0,2,0, + 2,3,3,0,3,1,3,3,3,3,0,0,3,2,3,3,2,2,2,1,1,0,0,0,0,0,0,2,0,0,0, + 3,3,3,2,3,3,2,2,2,3,2,3,3,3,2,2,3,2,3,2,2,2,0,2,2,2,1,0,0,1,0, + 2,3,3,2,3,0,3,3,2,3,0,1,3,3,3,2,2,3,2,2,2,2,0,0,0,0,1,3,1,0,0, + 3,2,2,3,2,2,3,2,1,2,2,2,0,2,2,3,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0, + 3,1,2,3,1,3,3,2,1,2,2,2,2,0,0,2,2,2,3,2,0,2,0,0,0,2,0,0,2,2,0, + 2,3,2,0,2,2,2,2,2,2,2,2,2,2,2,3,2,2,2,1,2,2,0,2,0,0,0,0,0,0,2, + 0,1,0,2,0,2,0,0,0,0,3,2,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Windows_1252GermanModel = +{ + Windows_1252_CharToOrderMap, + GermanLangModel, + 31, + (float)0.9934041448127945, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_1GermanModel = +{ + Iso_8859_1_CharToOrderMap, + GermanLangModel, + 31, + (float)0.9934041448127945, + PR_TRUE, + "ISO-8859-1" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 0f3cbb2..c4bd868 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -85,6 +85,9 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[17] = new nsSingleByteCharSetProber(&Latin2HungarianModel); mProbers[18] = new nsSingleByteCharSetProber(&Win1250HungarianModel); + mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); + mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 8dd25e0..c2a8768 100644 --- a/src/nsSBCSGroupProber.h +++ 
b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 19 +#define NUM_OF_SBCS_PROBERS 21 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 3af8c60..23c6cbb 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -139,6 +139,8 @@ extern const SequenceModel TIS620ThaiModel; extern const SequenceModel Iso_8859_15FrenchModel; extern const SequenceModel Iso_8859_1FrenchModel; extern const SequenceModel Windows_1252FrenchModel; +extern const SequenceModel Iso_8859_1GermanModel; +extern const SequenceModel Windows_1252GermanModel; #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/de/iso-8859-1.txt b/test/de/iso-8859-1.txt new file mode 100644 index 0000000..726a6c8 --- /dev/null +++ b/test/de/iso-8859-1.txt @@ -0,0 +1,11 @@ +ISO 8859-1, genauer ISO/IEC 8859-1, auch bekannt als Latin-1, ist ein von der +ISO zuletzt 1998 aktualisierter Standard für die Informationstechnik zur +Zeichenkodierung mit acht Bit und der erste Teil der Normenfamilie ISO/IEC 8859. + +Die mit sieben Bit kodierbaren Zeichen entsprechen US-ASCII mit führendem +Nullbit. Zusätzlich zu den 95 darstellbaren ASCII-Zeichen (2016-7E16) kodiert +ISO 8859-1 96 weitere (A016-FF16), also insgesamt 191 von theoretisch möglichen +256 (= 28). Den Positionen 0016-1F16 und 7F16-9F16 sind in ISO/IEC 8859 und +damit ISO/IEC 8859-1 keine Zeichen zugewiesen. Die von der IANA definierte +Bezeichnung ISO-8859-1 (mit Bindestrich) steht für die Kombination der Zeichen +dieser Norm mit nicht darstellbaren Steuerzeichen gemäß ISO/IEC 6429. 
diff --git a/test/de/windows-1252.txt b/test/de/windows-1252.txt new file mode 100644 index 0000000..7c51f46 --- /dev/null +++ b/test/de/windows-1252.txt @@ -0,0 +1,11 @@ +ISO 8859-1, genauer ISO/IEC 8859-1, auch bekannt als Latin-1, ist ein von der +ISO zuletzt 1998 aktualisierter Standard für die Informationstechnik zur +Zeichenkodierung mit acht Bit und der erste Teil der Normenfamilie ISO/IEC 8859. + +Die mit sieben Bit kodierbaren Zeichen entsprechen US-ASCII mit führendem +Nullbit. Zusätzlich zu den 95 darstellbaren ASCII-Zeichen (2016–7E16) kodiert +ISO 8859-1 96 weitere (A016–FF16), also insgesamt 191 von theoretisch möglichen +256 (= 28). Den Positionen 0016–1F16 und 7F16–9F16 sind in ISO/IEC 8859 und +damit ISO/IEC 8859-1 keine Zeichen zugewiesen. Die von der IANA definierte +Bezeichnung ISO-8859-1 (mit Bindestrich) steht für die Kombination der Zeichen +dieser Norm mit nicht darstellbaren Steuerzeichen gemäß ISO/IEC 6429.