From 5ee1c3ee398add498a19d92080adf0f34aa7bb33 Mon Sep 17 00:00:00 2001 From: Jehan Date: Fri, 4 Dec 2015 02:35:09 +0100 Subject: [PATCH] LangModels: adding Turkish models for ISO-8859-3 and ISO-8859-9. --- README.md | 3 + .../BuildLangModelLogs/LangTurkishModel.log | 113 ++++++++++++ src/CMakeLists.txt | 1 + src/LangModels/LangTurkishModel.cpp | 173 ++++++++++++++++++ src/nsSBCSGroupProber.cpp | 2 + src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 3 + test/tr/iso-8859-3.txt | 13 ++ test/tr/iso-8859-9.txt | 13 ++ 9 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 script/BuildLangModelLogs/LangTurkishModel.log create mode 100644 src/LangModels/LangTurkishModel.cpp create mode 100644 test/tr/iso-8859-3.txt create mode 100644 test/tr/iso-8859-9.txt diff --git a/README.md b/README.md index d427c29..5169b9a 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,9 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * Hungarian: * ISO-8859-2 * WINDOWS-1250 + * Turkish: + * ISO-8859-3 + * ISO-8859-9 * Others * WINDOWS-1252 diff --git a/script/BuildLangModelLogs/LangTurkishModel.log b/script/BuildLangModelLogs/LangTurkishModel.log new file mode 100644 index 0000000..51b31ad --- /dev/null +++ b/script/BuildLangModelLogs/LangTurkishModel.log @@ -0,0 +1,113 @@ += Logs of language model for Turkish (tr) = + +- Generated by BuildLangModel.py +- Started: 2015-12-04 02:22:03.929245 +- Maximum depth: 3 +- Max number of pages: 50 + +== Parsed pages == + +Ana_Sayfa (revision 16293313) +1048 (revision 12894005) +1131 (revision 14840814) +16. 
yüzyıl (revision 15185081) +1859 (revision 16014427) +1866 (revision 16120346) +1869 (revision 12888270) +1892 (revision 13955858) +1895 (revision 15334635) +1902 (revision 16283638) +1906 (revision 15874323) +1918 (revision 16099474) +1926 (revision 16180584) +1927 (revision 15370980) +1940 (revision 15370990) +1943 (revision 16091797) +1944 (revision 16247827) +1945 (revision 16281147) +1948 (revision 15443886) +1961 (revision 15799529) +1964 (revision 16085332) +1975 (revision 15006928) +1980 (revision 16213240) +1981 (revision 16295456) +1983 (revision 16327128) +1993 (revision 16300456) +2002 (revision 16297206) +2015 (revision 16328338) +24 Ekim (revision 16213661) +4 Aralık (revision 16341162) +ABD (revision 16325951) +ABD Senatosu (revision 15970439) +Adam Horowitz (revision 14362106) +Akçe (revision 16261547) +Altın Takım (revision 13503001) +American Broadcasting Company (revision 16055235) +Amerika Birleşik Devletleri (revision 16325951) +Ana Sayfa/Kardeş projeler (revision 16293313) +Ana Sayfa/Kategoriler (revision 16293313) +Aptullah Kuran (revision 15744893) +Avrupa (revision 16299756) +Ayasofya (revision 16305207) +BM Güvenlik Konseyi (revision 16085518) +Birleşmiş Milletler (revision 16258474) +Budapeşte (revision 16219173) +CIA (revision 16054325) +Charlie Pace (revision 16129416) +Cuma (revision 14197127) +Desmond Hume (revision 16035300) +Diğerleri (Lost) (revision 16329444) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2015-12-04 02:24:44.728803 + +48 characters appeared 267623 times. 
+ +First 36 characters: +[ 0] Char a: 12.311722086666691 % +[ 1] Char e: 8.716365932673948 % +[ 2] Char i: 8.507863673899479 % +[ 3] Char n: 7.322987934519828 % +[ 4] Char r: 6.979220769515326 % +[ 5] Char l: 6.609297407173524 % +[ 6] Char ı: 4.514933320379788 % +[ 7] Char d: 4.3475336574210734 % +[ 8] Char t: 4.2634601659797555 % +[ 9] Char k: 4.240293248338147 % +[10] Char s: 3.929781819948211 % +[11] Char m: 3.429451130881875 % +[12] Char u: 3.0998830444319063 % +[13] Char y: 2.9212735826143494 % +[14] Char o: 2.7135186437638024 % +[15] Char b: 2.3129551645411643 % +[16] Char ü: 1.8305601536489764 % +[17] Char ş: 1.5988909772328985 % +[18] Char z: 1.2267256551193282 % +[19] Char h: 1.1983274980102607 % +[20] Char v: 1.194964558352608 % +[21] Char c: 1.143773143563894 % +[22] Char g: 1.1004285879763698 % +[23] Char p: 1.0178497363828969 % +[24] Char ç: 0.8295251155543433 % +[25] Char ğ: 0.8205572764672693 % +[26] Char f: 0.7047226882592303 % +[27] Char ö: 0.6710932916827029 % +[28] Char j: 0.1296600068006113 % +[29] Char w: 0.11359262843627041 % +[30] Char â: 0.07846859201189733 % +[31] Char î: 0.04147625577771716 % +[32] Char x: 0.024287897527492032 % +[33] Char é: 0.014946398478456635 % +[34] Char q: 0.01083613889688106 % +[35] Char û: 0.009341499049035397 % + +The first 36 characters have an accumulated ratio of 0.99980569681978. + +935 sequences found. 
+ +First 512 (typical positive ratio): 0.991865243864388 +Next 512 (512-1024): 3.7365996196141585e-06 +Rest: 2.949029909160572e-17 + +- Processing end: 2015-12-04 02:24:44.883537 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7ab910b..2b57bde 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,6 +17,7 @@ set( LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp LangModels/LangThaiModel.cpp + LangModels/LangTurkishModel.cpp nsHebrewProber.cpp nsCharSetProber.cpp nsBig5Prober.cpp diff --git a/src/LangModels/LangTurkishModel.cpp b/src/LangModels/LangTurkishModel.cpp new file mode 100644 index 0000000..e68bcf6 --- /dev/null +++ b/src/LangModels/LangTurkishModel.cpp @@ -0,0 +1,173 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Turkish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-04 02:24:44.730727 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_3_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 6, 28, 9, 5, 11, 3, 14, /* 4X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 2, 28, 9, 5, 11, 3, 14, /* 6X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 48,SYM,SYM,SYM,ILL, 49,SYM,SYM, 2, 17, 25, 50,SYM,ILL, 51, /* AX */ + SYM, 52,SYM,SYM,SYM,SYM, 53,SYM,SYM, 6, 17, 25, 54,SYM,ILL, 55, /* BX */ + 41, 36, 30,ILL, 39, 56, 57, 24, 42, 33, 58, 45, 59, 37, 31, 60, /* CX */ + ILL, 47, 61, 38, 62, 63, 27,SYM, 64, 65, 40, 35, 16, 66, 67, 68, /* DX */ + 41, 36, 30,ILL, 39, 69, 70, 24, 42, 33, 71, 45, 72, 37, 31, 73, /* EX */ + ILL, 47, 74, 38, 75, 76, 27,SYM, 77, 78, 40, 35, 16, 79, 80,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 6, 28, 9, 5, 11, 3, 14, /* 4X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 15, 21, 7, 1, 26, 22, 19, 2, 28, 9, 5, 11, 3, 14, /* 6X */ + 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 41, 36, 30, 44, 39, 82, 46, 24, 42, 33, 83, 45, 84, 37, 31, 85, /* CX */ + 25, 47, 86, 38, 87, 88, 27,SYM, 43, 89, 40, 35, 16, 2, 17, 90, /* DX */ + 41, 36, 30, 44, 39, 91, 46, 24, 42, 33, 92, 45, 93, 37, 31, 94, /* EX */ + 25, 47, 95, 38, 96, 97, 27,SYM, 43, 98, 40, 35, 16, 6, 17, 99, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 935 + * First 512 sequences: 0.991865243864388 + * Next 512 sequences (512-1024): 0.008134756135611957 + * Rest: 2.949029909160572e-17 + * Negative sequences: TODO + */ +static const PRUint8 TurkishLangModel[] = +{ + 3,2,3,3,3,3,2,3,3,3,3,3,3,3,2,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,2,0, + 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,2,0,3,0,2,0, + 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,2,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,2,2,2,2, + 3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,2,3,0,3,2,2,2,2,3,0,2,2,2, + 3,2,0,3,3,3,3,3,3,3,3,3,2,3,2,3,0,3,3,2,3,3,2,3,2,3,2,0,0,0,0,0,2,0,0,0, + 3,3,3,2,3,3,3,3,2,2,2,2,3,3,3,2,3,0,2,2,2,2,2,2,0,0,0,3,2,3,2,2,0,0,0,0, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,2,2,2,3,0,2,3,2,2,3,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,2,2,2,2,3,0,2,3,2,2,3,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,2,3,3,0,2,3,0,2,2,0,0,2,2,2, + 3,3,3,2,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,0,3,2,3,2,0,2,2,0,2,3,2,2,2,2,2, + 3,3,3,3,3,3,0,3,3,3,3,3,2,3,2,3,0,3,3,3,3,3,3,3,3,3,2,0,2,2,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,2,2,2,3,2,2,0,2,3,0,2,2,0,0,2,0,2, + 
3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,0,0, + 3,3,3,3,3,3,3,3,0,2,2,3,3,3,3,3,3,0,2,2,2,2,0,2,0,0,0,3,2,2,2,0,0,2,0,0, + 2,2,2,3,3,3,0,3,3,3,3,3,0,3,2,3,0,3,3,3,3,3,2,3,3,3,3,0,2,0,0,0,0,0,0,0, + 3,3,3,0,2,3,3,2,3,3,2,3,3,2,2,3,3,2,0,2,2,2,2,2,3,0,2,2,0,0,2,2,0,0,0,0, + 3,3,3,2,2,3,3,3,2,2,0,3,3,3,3,2,3,0,2,2,0,3,3,0,0,0,0,2,0,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,2,2,2,2,2,0,2,3,0,2,0,0,2,3,2,0,2,0,2, + 3,3,3,2,3,3,2,2,0,2,3,2,3,3,3,2,2,2,2,2,3,2,2,0,0,0,2,0,0,0,2,2,0,0,0,0, + 3,3,3,2,3,3,3,2,3,3,2,2,3,2,3,2,3,0,2,3,0,2,0,0,0,0,0,2,0,0,2,0,0,2,2,2, + 3,3,3,2,3,3,3,2,2,2,2,0,3,2,3,0,3,0,2,3,2,0,2,2,0,0,2,3,2,2,2,0,0,2,0,0, + 3,3,3,0,3,3,3,2,3,2,3,3,3,2,3,2,2,0,2,3,0,2,2,3,2,0,2,0,0,2,2,0,2,2,0,0, + 3,3,3,0,2,3,3,2,3,2,0,3,3,2,3,2,3,2,0,0,0,0,2,2,0,0,0,3,0,0,0,0,0,0,0,0, + 3,3,3,0,3,3,3,3,0,0,0,3,3,0,0,2,3,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,2,3,2,2,0,3,3,3,2,2,0,0,2,0,2,2,0,2,0,2,2,2,0,2,2,0,0,0,0, + 0,0,0,3,3,3,0,3,3,3,3,3,0,3,0,2,0,2,3,2,2,0,0,2,3,3,2,0,2,0,0,0,0,0,0,0, + 3,3,3,0,0,2,2,2,0,2,0,0,3,0,3,0,2,0,0,0,0,2,2,2,0,0,0,2,0,0,2,0,0,0,0,0, + 3,3,3,2,2,2,0,0,0,2,2,2,2,2,3,2,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0, + 0,0,2,3,3,3,0,3,2,2,2,2,0,2,0,2,0,2,2,3,2,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0, + 0,0,0,2,0,2,0,2,2,0,0,2,0,2,0,0,0,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 3,2,2,0,0,0,2,0,2,0,0,0,0,2,2,0,0,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0, + 2,0,2,2,2,2,0,2,2,0,2,2,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0, + 2,0,2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,2,2,0,2,0,0,2,2,0,0,0,2,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_3TurkishModel = +{ + Iso_8859_3_CharToOrderMap, + TurkishLangModel, + 36, + (float)0.991865243864388, + PR_FALSE, + "ISO-8859-3" +}; + +const SequenceModel Iso_8859_9TurkishModel = +{ + Iso_8859_9_CharToOrderMap, + TurkishLangModel, + 36, + (float)0.991865243864388, + PR_FALSE, + "ISO-8859-9" +}; \ No newline at 
end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 210831b..69281c7 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -90,6 +90,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[21] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); + mProbers[22] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); + mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index dbe3650..575e93f 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 22 +#define NUM_OF_SBCS_PROBERS 24 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 63ce080..dde4ec9 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -152,5 +152,8 @@ extern const SequenceModel Windows_1252GermanModel; extern const SequenceModel Iso_8859_3EsperantoModel; +extern const SequenceModel Iso_8859_3TurkishModel; +extern const SequenceModel Iso_8859_9TurkishModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/tr/iso-8859-3.txt b/test/tr/iso-8859-3.txt new file mode 100644 index 0000000..0cb6dc0 --- /dev/null +++ b/test/tr/iso-8859-3.txt @@ -0,0 +1,13 @@ +Türkçe, Türk dili ya da Türkiye Türkçesi, bat¹da Balkanlar'dan baºlay¹p do»uda +Hazar Denizi sahas¹na kadar konuºulan Altay dillerinden biridir. Yaº¹, en eski +hesaplara göre 8500 olan Türkçe, bugün yaºayan Dünya dilleri aras¹nda en eski +yaz¹l¹ belgelere sahip olan dildir. Bu belgeler, çivi yaz¹l¹ Sümerce +tabletlerdeki al¹nt¹ kelimelerdir.[12] Türk yaz¹ dilleri içinde O»uz sahas¹ yaz¹ +dillerinden Osmanl¹ Türkçesinin devam¹n¹ oluºturur. Baºta Türkiye olmak üzere +eski Osmanl¹ ©mparatorlu»u co»rafyas¹nda konuºulan Türkçe, dünyada en fazla +konuºulan 5. dildir. 
Türkçe sondan eklemeli bir dildir.[13] Bundan ötürü +kullan¹lan herhangi bir eylem üzerinden istenildi»i kadar sözcük +türetilebilir.[14] Türkiye Türkçesi bu yönünden dolay¹ di»er Türk dilleriyle +ortak ya da ayr¹k bulunan onlarca eke sahiptir.[15] Türkçe çok geniº +kullan¹m¹yla birlikte zengin bir dil olmas¹n¹n yan¹ s¹ra, genel itibar¹yla +"özne-nesne-yüklem" biçimindeki cümle kuruluºuna sahiptir. diff --git a/test/tr/iso-8859-9.txt b/test/tr/iso-8859-9.txt new file mode 100644 index 0000000..4a69aa3 --- /dev/null +++ b/test/tr/iso-8859-9.txt @@ -0,0 +1,13 @@ +Türkçe, Türk dili ya da Türkiye Türkçesi, batýda Balkanlar'dan baþlayýp doðuda +Hazar Denizi sahasýna kadar konuþulan Altay dillerinden biridir. Yaþý, en eski +hesaplara göre 8500 olan Türkçe, bugün yaþayan Dünya dilleri arasýnda en eski +yazýlý belgelere sahip olan dildir. Bu belgeler, çivi yazýlý Sümerce +tabletlerdeki alýntý kelimelerdir.[12] Türk yazý dilleri içinde Oðuz sahasý yazý +dillerinden Osmanlý Türkçesinin devamýný oluþturur. Baþta Türkiye olmak üzere +eski Osmanlý Ýmparatorluðu coðrafyasýnda konuþulan Türkçe, dünyada en fazla +konuþulan 5. dildir. Türkçe sondan eklemeli bir dildir.[13] Bundan ötürü +kullanýlan herhangi bir eylem üzerinden istenildiði kadar sözcük +türetilebilir.[14] Türkiye Türkçesi bu yönünden dolayý diðer Türk dilleriyle +ortak ya da ayrýk bulunan onlarca eke sahiptir.[15] Türkçe çok geniþ +kullanýmýyla birlikte zengin bir dil olmasýnýn yaný sýra, genel itibarýyla +"özne-nesne-yüklem" biçimindeki cümle kuruluþuna sahiptir.