script, src: add generic Korean model.

Until now, Korean charsets had their own probers, as there is no single-byte encoding for writing Korean. I have now added a Korean model, used only for the generic character and sequence statistics. I also improved the generation script (script/BuildLangModel.py) to allow for languages without single-byte charset generation and to provide meaningful statistics even when the language's script has a lot of characters (so many that we can't have a full sequence combination array: it would be just too much data). It's not perfect yet. For instance, our UTF-8 Korean test file ends up with a confidence of 0.38503, which is low for obviously Korean text. Still, it works (correctly detected, with top confidence compared to others) and is a first step toward further improving detection confidence.
2026-02-05 17:30:09 +08:00 · 2021-03-18 17:51:22 +01:00 · 2021-03-18 17:51:22 +01:00 · 629bc879f3
commit 629bc879f3
parent 0d152ff430
8 changed files with 2221 additions and 39 deletions
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@ -115,6 +115,10 @@ if not hasattr(lang, 'custom_case_mapping'):
    lang.custom_case_mapping = None
 if not hasattr(lang, 'alphabet') or lang.alphabet is None:
    lang.alphabet = None
+if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
+    lang.unicode_ranges = None
+if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
+    lang.frequent_ranges = None

 def local_lowercase(text, lang):
    lowercased = ''
@ -151,6 +155,28 @@ if lang.alphabet is not None:
                #alphabet.append(l)
    lang.alphabet = list(set(lang.alphabet))

+def normalize_codepoint_ranges(input_range):
+  """Normalize a list of (start, end) ranges to integer code points.
+
+  Each bound may be given either as an int (the code point) or as a
+  single-character string (converted with ord()).
+
+  Invalid entries (bounds which are neither an int nor a single
+  character, or start > end) are reported on stderr and skipped.
+
+  Returns the list of valid (start, end) int tuples, or None when
+  `input_range` is None or contains no valid range. Callers test the
+  result with `is None` / `is not None`, so an empty list must never be
+  returned.
+  """
+  if input_range is None:
+    return None
+
+  output_range = []
+  for start, end in input_range:
+    # Allow to write down characters rather than unicode values.
+    # Only single characters are converted: ord() raises on longer
+    # strings, which are instead reported by the type check below.
+    if isinstance(start, str) and len(start) == 1:
+      start = ord(start)
+    if isinstance(end, str) and len(end) == 1:
+      end = ord(end)
+    if not isinstance(start, int) or not isinstance(end, int):
+      sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
+      # Do not fall through: comparing or storing such bounds is meaningless.
+      continue
+    if start > end:
+      sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
+      continue
+    output_range.append((start, end))
+
+  return output_range if output_range else None
+
+lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
+lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
+
 # Starting processing.
 wikipedia.set_lang(lang.wikipedia_code)

@ -187,10 +213,17 @@ def process_text(content, lang):
    # In python 3, strings are UTF-8.
    # Looping through them return expected characters.
    for char in content:
+        unicode_value = ord(char)
        is_letter = False
-        if ord(char) in characters:
-            characters[ord(char)] += 1
+        if unicode_value in characters:
+            characters[unicode_value] += 1
            is_letter = True
+        elif lang.unicode_ranges is not None:
+            for start, end in lang.unicode_ranges:
+              if unicode_value >= start and unicode_value <= end:
+                characters[unicode_value] = 1
+                is_letter = True
+                break
        else:
            # We save the character if it is at least in one of the
            # language encodings and its not a special character.
@ -221,16 +254,16 @@ def process_text(content, lang):
                # Not sure if that is a bug or expected.
                codepoint = ord(codepoint)
                if charsets[charset].charmap[codepoint] == LET:
-                    characters[ord(char)] = 1
+                    characters[unicode_value] = 1
                    is_letter = True
                    break
        if is_letter:
            if prev_char is not None:
-                if (prev_char, ord(char)) in sequences:
-                    sequences[(prev_char, ord(char))] += 1
+                if (prev_char, unicode_value) in sequences:
+                    sequences[(prev_char, unicode_value)] += 1
                else:
-                    sequences[(prev_char, ord(char))] = 1
-            prev_char = ord(char)
+                    sequences[(prev_char, unicode_value)] = 1
+            prev_char = unicode_value
        else:
            prev_char = None

@ -329,15 +362,23 @@ accumulated_ratios = 0
 # If there is an alphabet, we make sure all the alphabet characters are in the
 # frequent list, and we stop then. There may therefore be more or less than
 # 64 frequent characters depending on the language.
-if lang.alphabet is None:
+logfd.write('\nMost Frequent characters:')
+if lang.alphabet is None and lang.frequent_ranges is None:
    freq_count = 64
-else:
+    for order, (char, ratio) in enumerate(sorted_ratios):
+        if order >= freq_count:
+            break
+        logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+        accumulated_ratios += ratio
+elif lang.alphabet is not None:
    freq_count = 0
    for order, (char, ratio) in enumerate(sorted_ratios):
        if len(lang.alphabet) == 0:
            break
        if chr(char) in lang.alphabet:
            lang.alphabet.remove(chr(char))
+        logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+        accumulated_ratios += ratio
        freq_count += 1
    else:
        if len(lang.alphabet) > 0:
@ -345,13 +386,36 @@ else:
                  "\n       Please check the configuration or the data."
                  "\n       Missing characters: {}".format(", ".join(lang.alphabet)))
            exit(1)
+elif lang.frequent_ranges is not None:
+    freq_count = 0
+    non_freq_counter = 0
+    non_freq_ratio   = 0
+    for order, (char, ratio) in enumerate(sorted_ratios):
+      for start, end in lang.frequent_ranges:
+        if char >= start and char <= end:
+          freq_count += 1
+          non_freq_counter = 0
+          non_freq_ratio   = 0
+          accumulated_ratios += ratio
+          logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+          break
+      else:
+        if non_freq_counter >= 2:
+          # We don't try to get necessarily the whole range, but break
+          # when we are getting into known non-frequent area.
+          freq_count         -= non_freq_counter
+          accumulated_ratios -= non_freq_ratio
+          break
+        freq_count         += 1
+        accumulated_ratios += ratio

-logfd.write('\nFirst {} characters:'.format(freq_count))
-for order, (char, ratio) in enumerate(sorted_ratios):
-    if order >= freq_count:
+        non_freq_counter   += 1
+        non_freq_ratio     += ratio
+      if accumulated_ratios >= 0.99:
+        if non_freq_counter > 0:
+          freq_count         -= non_freq_counter
+          accumulated_ratios -= non_freq_ratio
        break
-    logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
-    accumulated_ratios += ratio

 logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))

@ -508,37 +572,66 @@ c_code += CTOM_str

 ratios = {}
 occurrences = sum(sequences.values())
-ratio_512 = 0
-ratio_1024 = 0

+# Running total of sequence occurrences, accumulated while walking the
+# sequences from most to least frequent.
+accumulated_seq_count = 0
+# For order_N / ratio_N, -1 is a sentinel meaning "threshold not reached yet".
+order_3 = -1
+order_2 = -1
+ratio_3 = -1
+ratio_2 = -1
+# These two are plain occurrence accumulators (first 512 sequences, then
+# the next 512), not sentinels: they must start at 0, otherwise the ratios
+# later derived from them are off by one occurrence.
+count_512 = 0
+count_1024 = 0
 sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
                     reverse=True)
 for order, ((c1, c2), count) in enumerate(sorted_seqs):
-    if order < 512:
-        ratio_512 += count
-    elif order < 1024:
-        ratio_1024 += count
-    else:
-        break
-ratio_512 /= occurrences
-ratio_1024 /= occurrences
+  accumulated_seq_count += count
+  if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
+    order_3 = order
+    ratio_3 = accumulated_seq_count / occurrences
+  elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
+    order_2 = order
+    ratio_2 = accumulated_seq_count / occurrences
+  if order < 512:
+    count_512 += count
+  elif order < 1024:
+    count_1024 += count
+
+  if order_3 != -1 and order_2 != -1:
+    break
+
+if order_3 == -1 or order_2 == -1:
+  # This will probably never happen. It would require a language with
+  # very few possible sequences, each of them being widely used. This
+  # code is only here for completeness; it is unlikely to ever run.
+  #
+  # Fall back to fixed thresholds. order_3 is the cut-off below which a
+  # sequence is classed as positive ('3') and order_2 the cut-off below
+  # which it is classed as probable ('2'), so order_3 must be the smaller
+  # of the two.
+  order_3 = 512
+  order_2 = 1024
+  # ratio_2 is cumulative (it includes the sequences counted in ratio_3),
+  # matching how it is computed from accumulated_seq_count in the loop.
+  ratio_3 = count_512 / occurrences
+  ratio_2 = (count_512 + count_1024) / occurrences

 logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))

 c_code += """
 /* Model Table:
- * Total sequences: {}
- * First 512 sequences: {}
- * Next 512 sequences (512-1024): {}
- * Rest: {}
+ * Total considered sequences: {} / {}
+ * - Positive sequences: first {} ({})
+ * - Probable sequences: next {} ({}-{}) ({})
+ * - Neutral sequences: last {} ({})
+ * - Negative sequences: {} (off-ratio)
 * Negative sequences: TODO""".format(len(sorted_seqs),
-                                      ratio_512,
-                                      ratio_1024,
-                                      1 - ratio_512 - ratio_1024)
+                                      freq_count * freq_count,
+                                      order_3, ratio_3,
+                                      order_2 - order_3,
+                                      order_2, order_3,
+                                      ratio_2 - ratio_3,
+                                      freq_count * freq_count - order_2,
+                                      1 - ratio_2,
+                                      freq_count * freq_count - len(sorted_seqs))

-logfd.write("\nFirst 512 (typical positive ratio): {}".format(ratio_512))
-logfd.write("\nNext 512 (512-1024): {}".format(ratio))
-logfd.write("\nRest: {}".format(1 - ratio_512 - ratio_1024))
+logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
+logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
+                                           order_2, order_3,
+                                           ratio_2 - ratio_3))
+logfd.write("\nRest: {}".format(1 - ratio_2))

 c_code += "\n */\n"

@ -558,9 +651,9 @@ for line in range(0, freq_count):
            if (first_char, second_char) in sequences:
                for order, (seq, _) in enumerate(sorted_seqs):
                    if seq == (first_char, second_char):
-                        if order < 512:
+                        if order < order_3:
                            LM_str += '3,'
-                        elif order < 1024:
+                        elif order < order_2:
                            LM_str += '2,'
                        else:
                            LM_str += '1,'
@ -583,7 +676,7 @@ for charset in charsets:
    SM_str += '\n{\n  '
    SM_str += '{}_CharToOrderMap,\n  {}LangModel,'.format(charset_c, language_c)
    SM_str += '\n  {},'.format(freq_count)
-    SM_str += '\n  (float){},'.format(ratio_512)
+    SM_str += '\n  (float){},'.format(ratio_2)
    SM_str += '\n  {},'.format('PR_TRUE' if lang.use_ascii else 'PR_FALSE')
    SM_str += '\n  "{}",'.format(charset)
    SM_str += '\n  "{}"'.format(lang.code)
@ -597,7 +690,7 @@ SM_str += '\n  Unicode_CharOrder,'
 SM_str += '\n  {},'.format(len(sorted_chars)) # Order is wrong!
 SM_str += '\n  {}LangModel,'.format(language_c)
 SM_str += '\n  {},'.format(freq_count)
-SM_str += '\n  (float){},'.format(ratio_512)
+SM_str += '\n  (float){},'.format(accumulated_ratios)
 SM_str += '\n};'
 c_code += SM_str

--- a/script/BuildLangModelLogs/LangKoreanModel.log
+++ b/script/BuildLangModelLogs/LangKoreanModel.log
@ -0,0 +1,701 @@
+= Logs of language model for Korean (ko) =
+
+- Generated by BuildLangModel.py
+- Started: 2021-03-18 21:59:02.887978
+- Maximum depth: 4
+- Max number of pages: 100
+
+== Parsed pages ==
+
+칼리스토_(위성) (revision 28961393)
+1610년 (revision 28556414)
+1월 7일 (revision 28876047)
+2003년 12월 (revision 23976672)
+2009년 목성 충돌사건 (revision 27323048)
+2010년 목성 충돌사건 (revision 27323048)
+2차 충돌구 (revision 28946595)
+D형 소행성 (revision 28394092)
+GND (식별자) (revision 28475536)
+HD 209458 b (revision 28560349)
+H 콘드라이트 (revision 28394899)
+LCCN (식별자) (revision 19573947)
+S/2003 J 10 (revision 25753708)
+S/2003 J 12 (revision 26400377)
+S/2003 J 16 (revision 28763186)
+S/2003 J 2 (revision 25753708)
+S/2003 J 23 (revision 25753708)
+S/2003 J 4 (revision 25753708)
+S/2003 J 9 (revision 26400377)
+S/2015 (136472) 1 (revision 25714103)
+SMARA (revision 24063296)
+VIAF (식별자) (revision 27137907)
+WorldCat Identities (식별자) (revision 27521768)
+가니메데 (위성) (revision 28631891)
+가속도 (revision 28183524)
+갈릴레오 (우주선) (revision 27368796)
+갈릴레오 갈릴레이 (revision 27826096)
+갈릴레이 위성 (revision 28669230)
+강착 (revision 26422060)
+개주기 함수 (revision 28695336)
+겉보기등급 (revision 28040769)
+고물 카테나 (revision 23226141)
+고전적 카이퍼대 천체 (revision 26313323)
+공전 주기 (revision 22619799)
+관성 모멘트 (revision 28070982)
+구조적 융기 (revision 27745126)
+궤도 경사 (revision 25327996)
+궤도 공명 (revision 27567384)
+궤도 이심률 (revision 26995575)
+규산염 (revision 22359319)
+규소 (revision 28601546)
+규칙 위성 (revision 20613029)
+그램 (revision 26373186)
+그리스 신화 (revision 28186023)
+그리스어 (revision 28615929)
+근적외선 (revision 27650322)
+근지구 소행성 (revision 28435841)
+금성 (revision 28157143)
+금성의 대기 (revision 28434028)
+기반암 (revision 25928764)
+기압 (revision 27417925)
+긴반지름 (revision 22347233)
+네레이드 (위성) (revision 26499827)
+녹는점 (revision 23970478)
+뇨르드 (충돌구) (revision 26331003)
+뉴 허라이즌스 (revision 28462690)
+뉴욕 타임스 (revision 27647761)
+능선 (revision 27745126)
+다모클레스군 (revision 25456157)
+단층애 (revision 19923354)
+달 (revision 28918055)
+달의 대기 (revision 28533295)
+대기권 (revision 28369550)
+대기압 (revision 27645730)
+대류 (revision 27653974)
+대적점 (revision 28897867)
+도 (각도) (revision 26551438)
+도 (충돌구) (revision 25843212)
+디스노미아 (위성) (revision 28921686)
+디아 (위성) (revision 26367850)
+디오네 (위성) (revision 28934552)
+디지털 객체 식별자 (revision 27657635)
+라플라스-P (revision 28591454)
+레다 (위성) (revision 26682939)
+레아 (위성) (revision 28593029)
+로픈 (충돌구) (revision 27645834)
+리시테아 (위성) (revision 26682938)
+림 (충돌구) (revision 27645834)
+마그네슘 (revision 27661919)
+마이크로미터 (revision 28705367)
+마케마케 (왜행성) (revision 26405503)
+맨틀 (revision 28942503)
+메가클리테 (revision 28907421)
+메티스 (위성) (revision 23978022)
+명왕성 (revision 28921751)
+명왕성의 위성 (revision 28903818)
+명왕성족 (revision 27553778)
+명왕성형 천체 (revision 27268668)
+목성 (revision 28958627)
+목성 LI (revision 28891035)
+목성 LII (revision 28903781)
+목성 LIV (revision 28903781)
+목성 LIX (revision 28903781)
+목성 LV (revision 28903781)
+목성 LVI (revision 28903781)
+목성 LXI (revision 28903781)
+목성 LXIII (revision 28903781)
+목성 LXIV (revision 28903781)
+목성 LXIX (revision 28903781)
+목성 LXVI (revision 28903781)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2021-03-18 22:02:04.863862
+
+1048 characters appeared 222213 times.
+
+Most Frequent characters:
+[ 0] Char 이: 3.8089580717599785 %
+[ 1] Char 다: 3.088028153168356 %
+[ 2] Char 의: 2.860318703226184 %
+[ 3] Char 성: 2.408049934072264 %
+[ 4] Char 는: 2.3108459001048542 %
+[ 5] Char 에: 2.287444928964552 %
+[ 6] Char 로: 1.8288758983497817 %
+[ 7] Char 은: 1.5381638337991026 %
+[ 8] Char 하: 1.5287134416078267 %
+[ 9] Char 가: 1.5017123210613241 %
+[10] Char 지: 1.470211013757071 %
+[11] Char 을: 1.3694068303834608 %
+[12] Char 기: 1.2281009661900968 %
+[13] Char 도: 1.188499322721893 %
+[14] Char 고: 1.1835491172883674 %
+[15] Char 한: 1.1565479967418648 %
+[16] Char 위: 1.079144784508557 %
+[17] Char 서: 1.0746445977508066 %
+[18] Char 스: 0.9832908065684727 %
+[19] Char 으: 0.9607898727797203 %
+[20] Char 리: 0.9256884160692669 %
+[21] Char 어: 0.8482852038359592 %
+[22] Char 대: 0.8446850544297588 %
+[23] Char 들: 0.8118336910981806 %
+[24] Char 있: 0.7902327946609784 %
+[25] Char 사: 0.7560313753020751 %
+[26] Char 를: 0.7555813566263 %
+[27] Char 과: 0.7078793769941453 %
+[28] Char 아: 0.6979789661270943 %
+[29] Char 되: 0.6610774347135406 %
+[30] Char 전: 0.6480268931160643 %
+[31] Char 일: 0.6475768744402892 %
+[32] Char 년: 0.6187756791906864 %
+[33] Char 시: 0.6079752309720853 %
+[34] Char 인: 0.5908745212926336 %
+[35] Char 그: 0.5872743718864333 %
+[36] Char 해: 0.5872743718864333 %
+[37] Char 목: 0.5661234941250062 %
+[38] Char 수: 0.5656734754492312 %
+[39] Char 라: 0.5616233073672557 %
+[40] Char 적: 0.5449726163635791 %
+[41] Char 나: 0.5382223362269534 %
+[42] Char 구: 0.533272130793428 %
+[43] Char 주: 0.5251717946294772 %
+[44] Char 자: 0.523821738602152 %
+[45] Char 정: 0.5080710849500254 %
+[46] Char 부: 0.49367048732522395 %
+[47] Char 었: 0.4792698897004226 %
+[48] Char 소: 0.44236835828686893 %
+[49] Char 보: 0.43606809682601827 %
+[50] Char 우: 0.43291796609559297 %
+[51] Char 레: 0.432467947419818 %
+[52] Char 발: 0.4243676112558671 %
+[53] Char 행: 0.4243676112558671 %
+[54] Char 여: 0.4054668268733152 %
+[55] Char 며: 0.4050168081975402 %
+[56] Char 와: 0.4036667521702151 %
+[57] Char 면: 0.39556641600626424 %
+[58] Char 명: 0.3802657810299127 %
+[59] Char 된: 0.37801568765103755 %
+[60] Char 화: 0.3775656689752625 %
+[61] Char 만: 0.36721523943243645 %
+[62] Char 중: 0.3667652207566614 %
+[63] Char 상: 0.35146458578030987 %
+[64] Char 공: 0.3478644363741095 %
+[65] Char 것: 0.34516432431945926 %
+[66] Char 오: 0.342464212264809 %
+[67] Char 원: 0.3366139694797334 %
+[68] Char 궤: 0.33391385742508317 %
+[69] Char 마: 0.32941367066733274 %
+[70] Char 문: 0.3285136333157826 %
+[71] Char 계: 0.32581352126113233 %
+[72] Char 유: 0.32176335317915694 %
+[73] Char 제: 0.3190632411245067 %
+[74] Char 영: 0.31771318509718155 %
+[75] Char 장: 0.31546309171830633 %
+[76] Char 양: 0.3064627182028054 %
+[77] Char 국: 0.3060126995270304 %
+[78] Char 게: 0.3051126621754803 %
+[79] Char 관: 0.3042126248239302 %
+[80] Char 니: 0.3033125874723801 %
+[81] Char 였: 0.2992624193904047 %
+[82] Char 데: 0.2965623073357544 %
+[83] Char 경: 0.29431221395687923 %
+[84] Char 체: 0.29296215792955405 %
+[85] Char 분: 0.292512139253779 %
+[86] Char 동: 0.29161210190222897 %
+[87] Char 반: 0.28081165368362787 %
+[88] Char 름: 0.27721150427742747 %
+[89] Char 신: 0.27721150427742747 %
+[90] Char 진: 0.27046122414080187 %
+[91] Char 비: 0.2695611867892517 %
+[92] Char 세: 0.2664110560588265 %
+[93] Char 갈: 0.2659610373830514 %
+[94] Char 태: 0.2632609253284011 %
+[95] Char 음: 0.2610108319495259 %
+[96] Char 했: 0.2556106078402254 %
+[97] Char 월: 0.2556106078402254 %
+[98] Char 메: 0.25201045843402503 %
+[99] Char 작: 0.25111042108247494 %
+[100] Char 형: 0.24931034637937474 %
+[101] Char 견: 0.24841030902782463 %
+[102] Char 릴: 0.24796029035204964 %
+[103] Char 모: 0.24661023432472448 %
+[104] Char 학: 0.24526017829739932 %
+[105] Char 선: 0.24481015962162428 %
+[106] Char 용: 0.2439101222700742 %
+[107] Char 러: 0.24211004756697405 %
+[108] Char 각: 0.241660028891199 %
+[109] Char 천: 0.2385098981607737 %
+[110] Char 크: 0.2358097861061234 %
+[111] Char 미: 0.23535976743034837 %
+[112] Char 연: 0.23310967405147315 %
+[113] Char 점: 0.2308595806725979 %
+[114] Char 트: 0.22815946861794767 %
+[115] Char 토: 0.22590937523907242 %
+[116] Char 물: 0.22590937523907242 %
+[117] Char 역: 0.22590937523907242 %
+[118] Char 때: 0.2236592818601972 %
+[119] Char 거: 0.22230922583287208 %
+[120] Char 표: 0.22095916980554692 %
+[121] Char 간: 0.22005913245399683 %
+[122] Char 터: 0.21915909510244677 %
+[123] Char 돌: 0.2178090390751216 %
+[124] Char 질: 0.21735902039934657 %
+[125] Char 르: 0.21510892702047132 %
+[126] Char 파: 0.21420888966892127 %
+[127] Char 재: 0.21375887099314622 %
+[128] Char 운: 0.2124088149658211 %
+[129] Char 개: 0.21015872158694585 %
+[130] Char 군: 0.2043084788018703 %
+[131] Char 바: 0.20025831071989486 %
+[132] Char 까: 0.19890825469256976 %
+[133] Char 산: 0.19215797455594408 %
+[134] Char 측: 0.191707955880169 %
+[135] Char 속: 0.191707955880169 %
+[136] Char 않: 0.191257937204394 %
+[137] Char 달: 0.1845076570677683 %
+[138] Char 조: 0.18180754501311805 %
+[139] Char 충: 0.181357526337343 %
+[140] Char 추: 0.17820739560691767 %
+[141] Char 두: 0.17820739560691767 %
+[142] Char 드: 0.1764073209038175 %
+[143] Char 또: 0.17460724620071733 %
+[144] Char 생: 0.17370720884916724 %
+[145] Char 통: 0.17055707811874193 %
+[146] Char 왕: 0.16875700341564176 %
+[147] Char 방: 0.16695692871254159 %
+[148] Char 함: 0.16470683533366637 %
+[149] Char 안: 0.1633567793063412 %
+[150] Char 합: 0.15930661122436582 %
+[151] Char 불: 0.15930661122436582 %
+[152] Char 히: 0.1584065738728157 %
+[153] Char 내: 0.15660649916971553 %
+[154] Char 후: 0.15570646181816544 %
+[155] Char 타: 0.1530063497635152 %
+[156] Char 약: 0.1521063124119651 %
+[157] Char 른: 0.151206275060415 %
+[158] Char 할: 0.151206275060415 %
+[159] Char 호: 0.15075625638463996 %
+[160] Char 네: 0.15075625638463996 %
+[161] Char 포: 0.1489561816815398 %
+[162] Char 심: 0.1480561443299897 %
+[163] Char 탐: 0.14760612565421466 %
+[164] Char 같: 0.14580605095111449 %
+[165] Char 등: 0.14580605095111449 %
+[166] Char 번: 0.14535603227533944 %
+[167] Char 량: 0.14445599492378933 %
+[168] Char 더: 0.14400597624801428 %
+[169] Char 현: 0.14310593889646422 %
+[170] Char 치: 0.13950578949026385 %
+[171] Char 외: 0.1390557708144888 %
+[172] Char 력: 0.13770571478716367 %
+[173] Char 칼: 0.1359056400840635 %
+[174] Char 따: 0.13275550935363817 %
+[175] Char 금: 0.13185547200208808 %
+[176] Char 실: 0.12645524789278756 %
+[177] Char 카: 0.12645524789278756 %
+[178] Char 차: 0.12510519186546243 %
+[179] Char 단: 0.12240507981081214 %
+[180] Char 프: 0.12240507981081214 %
+[181] Char 테: 0.1219550611350371 %
+[182] Char 규: 0.12150504245926207 %
+[183] Char 결: 0.12150504245926207 %
+[184] Char 독: 0.1179048930530617 %
+[185] Char 존: 0.11610481834996153 %
+[186] Char 변: 0.11475476232263639 %
+[187] Char 매: 0.11475476232263639 %
+[188] Char 루: 0.11430474364686136 %
+[189] Char 무: 0.11115461291643604 %
+[190] Char 석: 0.11070459424066098 %
+[191] Char 져: 0.11070459424066098 %
+[192] Char 근: 0.10935453821333586 %
+[193] Char 초: 0.10755446351023566 %
+[194] Char 향: 0.10665442615868559 %
+[195] Char 설: 0.10485435145558542 %
+[196] Char 째: 0.10395431410403531 %
+[197] Char 본: 0.10395431410403531 %
+[198] Char 직: 0.09765405264318469 %
+[199] Char 았: 0.09720403396740965 %
+[200] Char 말: 0.09720403396740965 %
+[201] Char 식: 0.0967540152916346 %
+[202] Char 많: 0.09630399661585956 %
+[203] Char 저: 0.09630399661585956 %
+[204] Char 요: 0.0958539779400845 %
+[205] Char 교: 0.0958539779400845 %
+[206] Char 련: 0.09540395926430947 %
+[207] Char 회: 0.09540395926430947 %
+[208] Char 알: 0.09135379118233407 %
+[209] Char 려: 0.09045375383078398 %
+[210] Char 배: 0.09045375383078398 %
+[211] Char 열: 0.0877536417761337 %
+[212] Char 칙: 0.08640358574880858 %
+[213] Char 티: 0.08640358574880858 %
+[214] Char 온: 0.08595356707303353 %
+[215] Char 처: 0.08550354839725849 %
+[216] Char 당: 0.08505352972148343 %
+[217] Char 색: 0.08505352972148343 %
+[218] Char 큰: 0.0846035110457084 %
+[219] Char 임: 0.08415349236993334 %
+[220] Char 예: 0.08325345501838326 %
+[221] Char 권: 0.08280343634260821 %
+[222] Char 순: 0.08280343634260821 %
+[223] Char 없: 0.08190339899105813 %
+[224] Char 던: 0.08010332428795795 %
+[225] Char 케: 0.07965330561218291 %
+[226] Char 항: 0.07920328693640785 %
+[227] Char 최: 0.07875326826063282 %
+[228] Char 강: 0.07830324958485776 %
+[229] Char 률: 0.07830324958485776 %
+[230] Char 망: 0.07830324958485776 %
+[231] Char 론: 0.0765031748817576 %
+[232] Char 쪽: 0.0756031375302075 %
+[233] Char 붙: 0.07470310017865742 %
+[234] Char 평: 0.07470310017865742 %
+[235] Char 확: 0.07425308150288236 %
+[236] Char 얼: 0.07335304415133227 %
+[237] Char 래: 0.07290302547555724 %
+[238] Char 밝: 0.07110295077245705 %
+[239] Char 류: 0.07020291342090697 %
+[240] Char 준: 0.06975289474513192 %
+[241] Char 노: 0.06885285739358184 %
+[242] Char 능: 0.06840283871780678 %
+[243] Char 록: 0.06795282004203175 %
+[244] Char 퍼: 0.06660276401470662 %
+[245] Char 졌: 0.06615274533893156 %
+[246] Char 받: 0.06615274533893156 %
+[247] Char 획: 0.06615274533893156 %
+[248] Char 키: 0.06390265196005634 %
+[249] Char 밀: 0.06390265196005634 %
+[250] Char 겨: 0.06390265196005634 %
+[251] Char 민: 0.06300261460850626 %
+[252] Char 플: 0.06255259593273121 %
+[253] Char 피: 0.06120253990540607 %
+[254] Char 탄: 0.06075252122963103 %
+[255] Char 증: 0.05985248387808094 %
+[256] Char 암: 0.059402465202305896 %
+[257] Char 남: 0.05895244652653085 %
+[258] Char 별: 0.05895244652653085 %
+[259] Char 층: 0.0585024278507558 %
+[260] Char 슘: 0.0585024278507558 %
+[261] Char 베: 0.0585024278507558 %
+[262] Char 쟁: 0.0585024278507558 %
+[263] Char 출: 0.058052409174980765 %
+[264] Char 축: 0.058052409174980765 %
+[265] Char 특: 0.058052409174980765 %
+[266] Char 압: 0.058052409174980765 %
+[267] Char 극: 0.058052409174980765 %
+[268] Char 높: 0.057602390499205715 %
+[269] Char 디: 0.057602390499205715 %
+[270] Char 코: 0.057602390499205715 %
+[271] Char 철: 0.05715237182343068 %
+[272] Char 종: 0.05715237182343068 %
+[273] Char 란: 0.05670235314765563 %
+[274] Char 탈: 0.05625233447188058 %
+[275] Char 격: 0.05535229712033049 %
+[276] Char 입: 0.05490227844455545 %
+[277] Char 접: 0.0544522597687804 %
+[278] Char 법: 0.0544522597687804 %
+[279] Char 령: 0.05400224109300536 %
+[280] Char 및: 0.053552222417230316 %
+[281] Char 야: 0.05265218506568022 %
+[282] Char 광: 0.052202166389905186 %
+[283] Char 난: 0.051752147714130135 %
+[284] Char 감: 0.051302129038355085 %
+[285] Char 판: 0.05085211036258005 %
+[286] Char 승: 0.05085211036258005 %
+[287] Char 새: 0.05085211036258005 %
+[288] Char 랑: 0.04995207301102996 %
+[289] Char 급: 0.048602016983704824 %
+[290] Char 건: 0.04815199830792978 %
+[291] Char 페: 0.04815199830792978 %
+[292] Char 투: 0.04725196095637969 %
+[293] Char 립: 0.04680194228060464 %
+[294] Char 폭: 0.0463519236048296 %
+[295] Char 황: 0.045901904929054556 %
+[296] Char 머: 0.045901904929054556 %
+[297] Char 버: 0.04545188625327951 %
+[298] Char 핵: 0.04500186757750446 %
+[299] Char 든: 0.04500186757750446 %
+[300] Char 틀: 0.044101830225954375 %
+[301] Char 몇: 0.04320179287440429 %
+[302] Char 날: 0.0423017555228542 %
+[303] Char 족: 0.04185173684707916 %
+[304] Char 럽: 0.04185173684707916 %
+[305] Char 언: 0.04185173684707916 %
+[306] Char 칭: 0.04185173684707916 %
+[307] Char 풍: 0.04140171817130411 %
+[308] Char 착: 0.04140171817130411 %
+[309] Char 럼: 0.04140171817130411 %
+[310] Char 균: 0.04140171817130411 %
+[311] Char 복: 0.04140171817130411 %
+[312] Char 집: 0.04140171817130411 %
+[313] Char 너: 0.04095169949552906 %
+[314] Char 움: 0.04095169949552906 %
+[315] Char 낮: 0.04050168081975402 %
+[316] Char 린: 0.04050168081975402 %
+[317] Char 람: 0.04050168081975402 %
+[318] Char 술: 0.040051662143978976 %
+[319] Char 허: 0.040051662143978976 %
+[320] Char 슷: 0.039601643468203926 %
+[321] Char 응: 0.0382515874408788 %
+[322] Char 찰: 0.03780156876510375 %
+[323] Char 클: 0.03780156876510375 %
+[324] Char 될: 0.03735155008932871 %
+[325] Char 백: 0.03645151273777862 %
+[326] Char 뉴: 0.03465143803467843 %
+[327] Char 완: 0.03420141935890339 %
+[328] Char 료: 0.033751400683128346 %
+[329] Char 쓰: 0.033751400683128346 %
+[330] Char 긴: 0.03330138200735331 %
+[331] Char 편: 0.03330138200735331 %
+[332] Char 떨: 0.03285136333157826 %
+[333] Char 맨: 0.03285136333157826 %
+[334] Char 첫: 0.03285136333157826 %
+[335] Char 침: 0.03285136333157826 %
+[336] Char 폴: 0.032401344655803216 %
+[337] Char 왜: 0.032401344655803216 %
+[338] Char 활: 0.03195132598002817 %
+[339] Char 험: 0.03195132598002817 %
+[340] Char 율: 0.03150130730425313 %
+[341] Char 멘: 0.03150130730425313 %
+[342] Char 습: 0.03150130730425313 %
+[343] Char 늘: 0.03105128862847808 %
+[344] Char 얻: 0.030601269952703035 %
+[345] Char 환: 0.03015125127692799 %
+[346] Char 울: 0.03015125127692799 %
+[347] Char 깥: 0.03015125127692799 %
+[348] Char 곳: 0.03015125127692799 %
+[349] Char 북: 0.03015125127692799 %
+[350] Char 왔: 0.029701232601152948 %
+[351] Char 났: 0.029701232601152948 %
+[352] Char 맹: 0.029701232601152948 %
+[353] Char 염: 0.0292512139253779 %
+[354] Char 먼: 0.0292512139253779 %
+[355] Char 느: 0.0292512139253779 %
+[356] Char 혜: 0.0292512139253779 %
+[357] Char 킬: 0.0292512139253779 %
+[358] Char 절: 0.0292512139253779 %
+[359] Char 볼: 0.0292512139253779 %
+[360] Char 줄: 0.0292512139253779 %
+[361] Char 헤: 0.0292512139253779 %
+[362] Char 필: 0.028351176573827814 %
+[363] Char 센: 0.02790115789805277 %
+[364] Char 값: 0.026551101870727636 %
+[365] Char 품: 0.026551101870727636 %
+[366] Char 참: 0.026101083194952593 %
+[367] Char 륙: 0.026101083194952593 %
+[368] Char 잡: 0.025651064519177542 %
+[369] Char 링: 0.025651064519177542 %
+[370] Char 께: 0.0252010458434025 %
+[371] Char 킨: 0.0252010458434025 %
+[372] Char 흔: 0.024301008491852412 %
+[373] Char 몬: 0.02385098981607737 %
+[374] Char 못: 0.02385098981607737 %
+[375] Char 촬: 0.02385098981607737 %
+[376] Char 막: 0.02385098981607737 %
+[377] Char 쳐: 0.02385098981607737 %
+[378] Char 찾: 0.02340097114030232 %
+[379] Char 웨: 0.02340097114030232 %
+[380] Char 슬: 0.022950952464527278 %
+[381] Char 둘: 0.022950952464527278 %
+[382] Char 징: 0.022950952464527278 %
+[383] Char 례: 0.022950952464527278 %
+[384] Char 올: 0.022950952464527278 %
+[385] Char 살: 0.022950952464527278 %
+[386] Char 즌: 0.02250093378875223 %
+[387] Char 브: 0.02250093378875223 %
+[388] Char 션: 0.02250093378875223 %
+[389] Char 즈: 0.021600896437202144 %
+[390] Char 런: 0.021600896437202144 %
+[391] Char 쿠: 0.021600896437202144 %
+[392] Char 헌: 0.0211508777614271 %
+[393] Char 곱: 0.0211508777614271 %
+[394] Char 웅: 0.0211508777614271 %
+[395] Char 헬: 0.0211508777614271 %
+[396] Char 밖: 0.020700859085652053 %
+[397] Char 멀: 0.020700859085652053 %
+[398] Char 혀: 0.020700859085652053 %
+[399] Char 빠: 0.020700859085652053 %
+[400] Char 범: 0.020700859085652053 %
+[401] Char 므: 0.020700859085652053 %
+[402] Char 힘: 0.020700859085652053 %
+[403] Char 넘: 0.02025084040987701 %
+[404] Char 워: 0.02025084040987701 %
+[405] Char 커: 0.02025084040987701 %
+[406] Char 팀: 0.02025084040987701 %
+[407] Char 뮬: 0.02025084040987701 %
+[408] Char 냈: 0.019800821734101963 %
+[409] Char 총: 0.019800821734101963 %
+[410] Char 손: 0.019800821734101963 %
+[411] Char 갖: 0.019800821734101963 %
+[412] Char 빛: 0.01935080305832692 %
+[413] Char 액: 0.01935080305832692 %
+[414] Char 창: 0.01935080305832692 %
+[415] Char 논: 0.01935080305832692 %
+[416] Char 낸: 0.018900784382551876 %
+[417] Char 즉: 0.018900784382551876 %
+[418] Char 억: 0.018900784382551876 %
+[419] Char 청: 0.018900784382551876 %
+[420] Char 혹: 0.018450765706776832 %
+[421] Char 블: 0.018450765706776832 %
+[422] Char 책: 0.018450765706776832 %
+[423] Char 찬: 0.018450765706776832 %
+[424] Char 곡: 0.018000747031001785 %
+[425] Char 누: 0.018000747031001785 %
+[426] Char 패: 0.018000747031001785 %
+[427] Char 잘: 0.018000747031001785 %
+[428] Char 림: 0.017550728355226742 %
+[429] Char 검: 0.017550728355226742 %
+[430] Char 채: 0.017550728355226742 %
+[431] Char 녹: 0.017100709679451695 %
+[432] Char 괴: 0.017100709679451695 %
+[433] Char 십: 0.017100709679451695 %
+[434] Char 글: 0.017100709679451695 %
+[435] Char 빨: 0.017100709679451695 %
+[436] Char 융: 0.016650691003676655 %
+[437] Char 렸: 0.016650691003676655 %
+[438] Char 길: 0.016650691003676655 %
+[439] Char 삼: 0.016650691003676655 %
+[440] Char 협: 0.016650691003676655 %
+[441] Char 잃: 0.016650691003676655 %
+[442] Char 병: 0.016650691003676655 %
+[443] Char 옅: 0.016200672327901608 %
+[444] Char 념: 0.015750653652126564 %
+[445] Char 뜻: 0.015750653652126564 %
+[446] Char 켜: 0.015300634976351517 %
+[447] Char 걸: 0.015300634976351517 %
+[448] Char 효: 0.015300634976351517 %
+[449] Char 육: 0.015300634976351517 %
+[450] Char 벨: 0.015300634976351517 %
+[451] Char 업: 0.015300634976351517 %
+[452] Char 숫: 0.014850616300576474 %
+[453] Char 틴: 0.014850616300576474 %
+[454] Char 잔: 0.014850616300576474 %
+[455] Char 뒤: 0.014850616300576474 %
+[456] Char 벽: 0.014400597624801429 %
+[457] Char 벌: 0.014400597624801429 %
+[458] Char 짧: 0.014400597624801429 %
+[459] Char 륨: 0.014400597624801429 %
+[460] Char 친: 0.013950578949026385 %
+[461] Char 섭: 0.01350056027325134 %
+[462] Char 톤: 0.01350056027325134 %
+[463] Char 끌: 0.01350056027325134 %
+[464] Char 애: 0.013050541597476296 %
+[465] Char 눈: 0.013050541597476296 %
+[466] Char 담: 0.013050541597476296 %
+[467] Char 캐: 0.013050541597476296 %
+[468] Char 끝: 0.013050541597476296 %
+[469] Char 턴: 0.01260052292170125 %
+[470] Char 희: 0.01260052292170125 %
+[471] Char 략: 0.01260052292170125 %
+[472] Char 떤: 0.01260052292170125 %
+[473] Char 깊: 0.012150504245926206 %
+[474] Char 켰: 0.012150504245926206 %
+[475] Char 렇: 0.012150504245926206 %
+[476] Char 흡: 0.012150504245926206 %
+[477] Char 겼: 0.012150504245926206 %
+[478] Char 슈: 0.012150504245926206 %
+[479] Char 빈: 0.012150504245926206 %
+[480] Char 곽: 0.012150504245926206 %
+[481] Char 앙: 0.012150504245926206 %
+[482] Char 악: 0.012150504245926206 %
+[483] Char 택: 0.012150504245926206 %
+[484] Char 취: 0.012150504245926206 %
+[485] Char 늄: 0.012150504245926206 %
+[486] Char 찌: 0.01170048557015116 %
+[487] Char 박: 0.01170048557015116 %
+[488] Char 맞: 0.01170048557015116 %
+[489] Char 앞: 0.01170048557015116 %
+[490] Char 톨: 0.01170048557015116 %
+[491] Char 렵: 0.01170048557015116 %
+[492] Char 덮: 0.011250466894376115 %
+[493] Char 펙: 0.011250466894376115 %
+[494] Char 묘: 0.011250466894376115 %
+[495] Char 쌍: 0.011250466894376115 %
+[496] Char 덕: 0.010800448218601072 %
+[497] Char 켈: 0.010800448218601072 %
+[498] Char 엔: 0.010800448218601072 %
+[499] Char 델: 0.010800448218601072 %
+[500] Char 핀: 0.010800448218601072 %
+[501] Char 힌: 0.010800448218601072 %
+[502] Char 섬: 0.010800448218601072 %
+[503] Char 씨: 0.010800448218601072 %
+[504] Char 퇴: 0.010350429542826027 %
+[505] Char 렌: 0.010350429542826027 %
+[506] Char 웹: 0.010350429542826027 %
+[507] Char 텐: 0.010350429542826027 %
+[508] Char 섯: 0.010350429542826027 %
+[509] Char 흑: 0.010350429542826027 %
+[510] Char 큼: 0.009900410867050981 %
+[511] Char 혼: 0.009900410867050981 %
+[512] Char 써: 0.009900410867050981 %
+[513] Char 슨: 0.009900410867050981 %
+[514] Char 송: 0.009900410867050981 %
+[515] Char 좌: 0.009900410867050981 %
+[516] Char 덜: 0.009900410867050981 %
+[517] Char 뀌: 0.009900410867050981 %
+[518] Char 엘: 0.009900410867050981 %
+[519] Char 텔: 0.009900410867050981 %
+[520] Char 쪼: 0.009450392191275938 %
+[521] Char 락: 0.009450392191275938 %
+[522] Char 겉: 0.009450392191275938 %
+[523] Char 렀: 0.009450392191275938 %
+[524] Char 욕: 0.009450392191275938 %
+[525] Char 힐: 0.009450392191275938 %
+[526] Char 떠: 0.009000373515500893 %
+[527] Char 널: 0.009000373515500893 %
+[528] Char 콘: 0.009000373515500893 %
+[529] Char 램: 0.009000373515500893 %
+[530] Char 엄: 0.009000373515500893 %
+[531] Char 룬: 0.009000373515500893 %
+[532] Char 딸: 0.009000373515500893 %
+[533] Char 벼: 0.009000373515500893 %
+[534] Char 윙: 0.009000373515500893 %
+[535] Char 휘: 0.008550354839725847 %
+[536] Char 밤: 0.008550354839725847 %
+[537] Char 뿐: 0.008550354839725847 %
+[538] Char 곧: 0.008550354839725847 %
+[539] Char 훨: 0.008550354839725847 %
+[540] Char 씬: 0.008550354839725847 %
+[541] Char 큐: 0.008550354839725847 %
+[542] Char 숭: 0.008550354839725847 %
+[543] Char 띄: 0.008100336163950804 %
+[544] Char 닌: 0.008100336163950804 %
+[545] Char 깝: 0.008100336163950804 %
+[546] Char 흐: 0.008100336163950804 %
+[547] Char 웠: 0.008100336163950804 %
+[548] Char 롯: 0.008100336163950804 %
+[549] Char 뜨: 0.008100336163950804 %
+[550] Char 죽: 0.008100336163950804 %
+[551] Char 즘: 0.008100336163950804 %
+[552] Char 닉: 0.008100336163950804 %
+[553] Char 붕: 0.007650317488175759 %
+[554] Char 욱: 0.007650317488175759 %
+[555] Char 끼: 0.007650317488175759 %
+[556] Char 익: 0.007650317488175759 %
+[557] Char 옛: 0.007650317488175759 %
+[558] Char 붉: 0.007650317488175759 %
+[559] Char 칠: 0.007650317488175759 %
+[560] Char 웰: 0.007650317488175759 %
+[561] Char 컸: 0.007650317488175759 %
+[562] Char 씩: 0.007650317488175759 %
+[563] Char 낙: 0.007650317488175759 %
+[564] Char 녀: 0.007650317488175759 %
+[565] Char 얇: 0.007650317488175759 %
+[566] Char 싸: 0.007200298812400714 %
+[567] Char 꺼: 0.007200298812400714 %
+[568] Char 찍: 0.007200298812400714 %
+[569] Char 랜: 0.007200298812400714 %
+[570] Char 골: 0.007200298812400714 %
+[571] Char 옹: 0.007200298812400714 %
+[572] Char 빌: 0.007200298812400714 %
+[573] Char 칸: 0.007200298812400714 %
+
+The first 574 characters have an accumulated ratio of 0.9900230859580663.
+
+14099 sequences found.
+
+First 13365 (typical positive ratio): 0.995000852514919
+Next 587 (13952-13365): 0.004003410059676082
+Rest: 0.00099573742540493
+
+- Processing end: 2021-03-18 22:02:18.933817
--- a/script/langs/ko.py
+++ b/script/langs/ko.py
@ -0,0 +1,73 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#          Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+# The human name for the language, in English.
+name = 'Korean'
+# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
+# or use another catalog as a last resort.
+code = 'ko'
+# ASCII characters are not commonly used in Korean.
+use_ascii = False
+# No single-byte charset to generate: only character/sequence statistics.
+charsets = []
+
+## Optional Properties ##
+
+# Hangul codepoint ranges, each written as a (start, end) pair of characters.
+frequent_ranges = [('가', '힣')] # Hangul Syllables (AC00–D7A3)
+unicode_ranges = [('가', '힣'), # Hangul Syllables (AC00–D7A3)
+                  ('ᄀ', 'ᇿ'), # Hangul Jamo (1100–11FF)
+                  ('ㄱ', 'ㆎ'), # Hangul Compatibility Jamo (3130–318F)
+                  ('ꥠ', 'ꥼ'), # Hangul Jamo Extended-A (A960–A97F)
+                  ('ힰ', 'ퟆ'), # Hangul Jamo Extended-B (D7B0–D7FF)
+                  ('ퟋ', 'ퟻ'), # Hangul Jamo Extended-B (D7B0–D7FF) - second part
+                 ]
+# The start page. Though optional, it is advised to choose one yourself.
+start_pages = ['칼리스토_(위성)']
+# Gives the possibility to select another code for the Wikipedia URL.
+wikipedia_code = code
+# 'a' and 'A' will be considered the same character, and so on.
+# This uses Python's algorithm to determine the upper/lower-case of a given
+# character.
+case_mapping = True
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -22,6 +22,7 @@ set(
 	LangModels/LangHebrewModel.cpp
 	LangModels/LangIrishModel.cpp
 	LangModels/LangItalianModel.cpp
+	LangModels/LangKoreanModel.cpp
    LangModels/LangLithuanianModel.cpp
    LangModels/LangLatvianModel.cpp
    LangModels/LangMalteseModel.cpp
--- a/src/LangModels/LangKoreanModel.cpp
+++ b/src/LangModels/LangKoreanModel.cpp
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@ -123,6 +123,7 @@ extern const LanguageModel HebrewModel;
 extern const LanguageModel HungarianModel;
 extern const LanguageModel IrishModel;
 extern const LanguageModel ItalianModel;
+extern const LanguageModel KoreanModel;
 extern const LanguageModel LatvianModel;
 extern const LanguageModel LithuanianModel;
 extern const LanguageModel MalteseModel;
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@ -118,6 +118,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
      langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel);
      langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel);
      langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel);
+      langDetectors[i][j++] = new nsLanguageDetector(&KoreanModel);
    }
    else
    {
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@ -49,7 +49,7 @@
 #include "nsEUCTWProber.h"

 #define NUM_OF_PROBERS    8
-#define NUM_OF_LANGUAGES  27
+#define NUM_OF_LANGUAGES  28

 class nsMBCSGroupProber: public nsCharSetProber {
 public: