script, src: add generic Korean model.

Until now, Korean charsets had their own probers, as there is no
single-byte encoding for writing Korean. I have now added a Korean model,
used only for the generic character and sequence statistics.

I also improved the generation script (script/BuildLangModel.py) to
allow for languages without single-byte charset generation and to
provide meaningful statistics even when the language script has a lot of
characters (so we can't have a full sequence combination array, as that
would be just too much data). It's not perfect yet. For instance, our
UTF-8 Korean test file ends up with a confidence of 0.38503, which is low
for obviously Korean text. Still, it works (correctly detected, with the
top confidence compared to others) and is a first step toward further
improvement of the detection confidence.
This commit is contained in:
Jehan 2021-03-18 17:51:22 +01:00
parent 0d152ff430
commit 629bc879f3
8 changed files with 2221 additions and 39 deletions

View File

@ -115,6 +115,10 @@ if not hasattr(lang, 'custom_case_mapping'):
lang.custom_case_mapping = None
if not hasattr(lang, 'alphabet') or lang.alphabet is None:
lang.alphabet = None
if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
lang.unicode_ranges = None
if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
lang.frequent_ranges = None
def local_lowercase(text, lang):
lowercased = ''
@ -151,6 +155,28 @@ if lang.alphabet is not None:
#alphabet.append(l)
lang.alphabet = list(set(lang.alphabet))
def normalize_codepoint_ranges(input_range):
    """Normalize a list of Unicode code point ranges.

    input_range is either None or an iterable of (start, end) pairs. Each
    bound may be an integer code point or a single-character string, which
    is converted to its code point with ord().

    Invalid entries are reported on stderr and left out of the result:
    bounds which are neither an integer nor a single character, and ranges
    whose start is greater than their end.

    Returns the list of valid (start, end) integer tuples, or None when
    input_range is None or when no valid range remains.
    """
    output_range = []
    if input_range is not None:
        for start, end in input_range:
            # Allow to write down characters rather than unicode values.
            # Only single characters can be converted: ord() raises on
            # longer strings, so these are left as is and reported below.
            if isinstance(start, str) and len(start) == 1:
                start = ord(start)
            if isinstance(end, str) and len(end) == 1:
                end = ord(end)
            if not isinstance(start, int) or not isinstance(end, int):
                sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
                # Skip the entry: comparing non-integer bounds below could
                # raise a TypeError or let a bogus range through.
                continue
            if start > end:
                sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
                continue
            output_range.append((start, end))
    if len(output_range) == 0:
        output_range = None
    return output_range
lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
# Starting processing.
wikipedia.set_lang(lang.wikipedia_code)
@ -187,10 +213,17 @@ def process_text(content, lang):
# In Python 3, strings are sequences of Unicode code points.
# Looping through them returns the expected characters.
for char in content:
unicode_value = ord(char)
is_letter = False
if ord(char) in characters:
characters[ord(char)] += 1
if unicode_value in characters:
characters[unicode_value] += 1
is_letter = True
elif lang.unicode_ranges is not None:
for start, end in lang.unicode_ranges:
if unicode_value >= start and unicode_value <= end:
characters[unicode_value] = 1
is_letter = True
break
else:
# We save the character if it is at least in one of the
# language encodings and it is not a special character.
@ -221,16 +254,16 @@ def process_text(content, lang):
# Not sure if that is a bug or expected.
codepoint = ord(codepoint)
if charsets[charset].charmap[codepoint] == LET:
characters[ord(char)] = 1
characters[unicode_value] = 1
is_letter = True
break
if is_letter:
if prev_char is not None:
if (prev_char, ord(char)) in sequences:
sequences[(prev_char, ord(char))] += 1
if (prev_char, unicode_value) in sequences:
sequences[(prev_char, unicode_value)] += 1
else:
sequences[(prev_char, ord(char))] = 1
prev_char = ord(char)
sequences[(prev_char, unicode_value)] = 1
prev_char = unicode_value
else:
prev_char = None
@ -329,15 +362,23 @@ accumulated_ratios = 0
# If there is an alphabet, we make sure all the alphabet characters are in the
# frequent list, and we stop then. There may therefore be more or less than
# 64 frequent characters depending on the language.
if lang.alphabet is None:
logfd.write('\nMost Frequent characters:')
if lang.alphabet is None and lang.frequent_ranges is None:
freq_count = 64
else:
for order, (char, ratio) in enumerate(sorted_ratios):
if order >= freq_count:
break
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
accumulated_ratios += ratio
elif lang.alphabet is not None:
freq_count = 0
for order, (char, ratio) in enumerate(sorted_ratios):
if len(lang.alphabet) == 0:
break
if chr(char) in lang.alphabet:
lang.alphabet.remove(chr(char))
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
accumulated_ratios += ratio
freq_count += 1
else:
if len(lang.alphabet) > 0:
@ -345,13 +386,36 @@ else:
"\n Please check the configuration or the data."
"\n Missing characters: {}".format(", ".join(lang.alphabet)))
exit(1)
elif lang.frequent_ranges is not None:
freq_count = 0
non_freq_counter = 0
non_freq_ratio = 0
for order, (char, ratio) in enumerate(sorted_ratios):
for start, end in lang.frequent_ranges:
if char >= start and char <= end:
freq_count += 1
non_freq_counter = 0
non_freq_ratio = 0
accumulated_ratios += ratio
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
break
else:
if non_freq_counter >= 2:
# We don't necessarily try to get the whole range, but break
# when we are getting into a known non-frequent area.
freq_count -= non_freq_counter
accumulated_ratios -= non_freq_ratio
break
freq_count += 1
accumulated_ratios += ratio
logfd.write('\nFirst {} characters:'.format(freq_count))
for order, (char, ratio) in enumerate(sorted_ratios):
if order >= freq_count:
non_freq_counter += 1
non_freq_ratio += ratio
if accumulated_ratios >= 0.99:
if non_freq_counter > 0:
freq_count -= non_freq_counter
accumulated_ratios -= non_freq_ratio
break
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
accumulated_ratios += ratio
logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
@ -508,37 +572,66 @@ c_code += CTOM_str
ratios = {}
occurrences = sum(sequences.values())
ratio_512 = 0
ratio_1024 = 0
accumulated_seq_count = 0
order_3 = -1
order_2 = -1
ratio_3 = -1
ratio_2 = -1
count_512 = -1
count_1024 = -1
sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
reverse=True)
for order, ((c1, c2), count) in enumerate(sorted_seqs):
if order < 512:
ratio_512 += count
elif order < 1024:
ratio_1024 += count
else:
break
ratio_512 /= occurrences
ratio_1024 /= occurrences
accumulated_seq_count += count
if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
order_3 = order
ratio_3 = accumulated_seq_count / occurrences
elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
order_2 = order
ratio_2 = accumulated_seq_count / occurrences
if order < 512:
count_512 += count
elif order < 1024:
count_1024 += count
if order_3 != -1 and order_2 != -1:
break
if order_3 == -1 or order_2 == -1:
# This will probably never happen. It would require a language with
# very few possible sequences, each of which is widely used. This
# code is only added for completeness; it will likely never be
# run.
order_2 = 512
order_3 = 1024
ratio_2 = count_512 / occurrences
ratio_3 = count_1024 / occurrences
logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))
c_code += """
/* Model Table:
* Total sequences: {}
* First 512 sequences: {}
* Next 512 sequences (512-1024): {}
* Rest: {}
* Total considered sequences: {} / {}
* - Positive sequences: first {} ({})
* - Probable sequences: next {} ({}-{}) ({})
* - Neutral sequences: last {} ({})
* - Negative sequences: {} (off-ratio)
* Negative sequences: TODO""".format(len(sorted_seqs),
ratio_512,
ratio_1024,
1 - ratio_512 - ratio_1024)
freq_count * freq_count,
order_3, ratio_3,
order_2 - order_3,
order_2, order_3,
ratio_2 - ratio_3,
freq_count * freq_count - order_2,
1 - ratio_2,
freq_count * freq_count - len(sorted_seqs))
logfd.write("\nFirst 512 (typical positive ratio): {}".format(ratio_512))
logfd.write("\nNext 512 (512-1024): {}".format(ratio))
logfd.write("\nRest: {}".format(1 - ratio_512 - ratio_1024))
logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
order_2, order_3,
ratio_2 - ratio_3))
logfd.write("\nRest: {}".format(1 - ratio_2))
c_code += "\n */\n"
@ -558,9 +651,9 @@ for line in range(0, freq_count):
if (first_char, second_char) in sequences:
for order, (seq, _) in enumerate(sorted_seqs):
if seq == (first_char, second_char):
if order < 512:
if order < order_3:
LM_str += '3,'
elif order < 1024:
elif order < order_2:
LM_str += '2,'
else:
LM_str += '1,'
@ -583,7 +676,7 @@ for charset in charsets:
SM_str += '\n{\n '
SM_str += '{}_CharToOrderMap,\n {}LangModel,'.format(charset_c, language_c)
SM_str += '\n {},'.format(freq_count)
SM_str += '\n (float){},'.format(ratio_512)
SM_str += '\n (float){},'.format(ratio_2)
SM_str += '\n {},'.format('PR_TRUE' if lang.use_ascii else 'PR_FALSE')
SM_str += '\n "{}",'.format(charset)
SM_str += '\n "{}"'.format(lang.code)
@ -597,7 +690,7 @@ SM_str += '\n Unicode_CharOrder,'
SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
SM_str += '\n {}LangModel,'.format(language_c)
SM_str += '\n {},'.format(freq_count)
SM_str += '\n (float){},'.format(ratio_512)
SM_str += '\n (float){},'.format(accumulated_ratios)
SM_str += '\n};'
c_code += SM_str

View File

@ -0,0 +1,701 @@
= Logs of language model for Korean (ko) =
- Generated by BuildLangModel.py
- Started: 2021-03-18 21:59:02.887978
- Maximum depth: 4
- Max number of pages: 100
== Parsed pages ==
칼리스토_(위성) (revision 28961393)
1610년 (revision 28556414)
1월 7일 (revision 28876047)
2003년 12월 (revision 23976672)
2009년 목성 충돌사건 (revision 27323048)
2010년 목성 충돌사건 (revision 27323048)
2차 충돌구 (revision 28946595)
D형 소행성 (revision 28394092)
GND (식별자) (revision 28475536)
HD 209458 b (revision 28560349)
H 콘드라이트 (revision 28394899)
LCCN (식별자) (revision 19573947)
S/2003 J 10 (revision 25753708)
S/2003 J 12 (revision 26400377)
S/2003 J 16 (revision 28763186)
S/2003 J 2 (revision 25753708)
S/2003 J 23 (revision 25753708)
S/2003 J 4 (revision 25753708)
S/2003 J 9 (revision 26400377)
S/2015 (136472) 1 (revision 25714103)
SMARA (revision 24063296)
VIAF (식별자) (revision 27137907)
WorldCat Identities (식별자) (revision 27521768)
가니메데 (위성) (revision 28631891)
가속도 (revision 28183524)
갈릴레오 (우주선) (revision 27368796)
갈릴레오 갈릴레이 (revision 27826096)
갈릴레이 위성 (revision 28669230)
강착 (revision 26422060)
개주기 함수 (revision 28695336)
겉보기등급 (revision 28040769)
고물 카테나 (revision 23226141)
고전적 카이퍼대 천체 (revision 26313323)
공전 주기 (revision 22619799)
관성 모멘트 (revision 28070982)
구조적 융기 (revision 27745126)
궤도 경사 (revision 25327996)
궤도 공명 (revision 27567384)
궤도 이심률 (revision 26995575)
규산염 (revision 22359319)
규소 (revision 28601546)
규칙 위성 (revision 20613029)
그램 (revision 26373186)
그리스 신화 (revision 28186023)
그리스어 (revision 28615929)
근적외선 (revision 27650322)
근지구 소행성 (revision 28435841)
금성 (revision 28157143)
금성의 대기 (revision 28434028)
기반암 (revision 25928764)
기압 (revision 27417925)
긴반지름 (revision 22347233)
네레이드 (위성) (revision 26499827)
녹는점 (revision 23970478)
뇨르드 (충돌구) (revision 26331003)
뉴 허라이즌스 (revision 28462690)
뉴욕 타임스 (revision 27647761)
능선 (revision 27745126)
다모클레스군 (revision 25456157)
단층애 (revision 19923354)
달 (revision 28918055)
달의 대기 (revision 28533295)
대기권 (revision 28369550)
대기압 (revision 27645730)
대류 (revision 27653974)
대적점 (revision 28897867)
도 (각도) (revision 26551438)
도 (충돌구) (revision 25843212)
디스노미아 (위성) (revision 28921686)
디아 (위성) (revision 26367850)
디오네 (위성) (revision 28934552)
디지털 객체 식별자 (revision 27657635)
라플라스-P (revision 28591454)
레다 (위성) (revision 26682939)
레아 (위성) (revision 28593029)
로픈 (충돌구) (revision 27645834)
리시테아 (위성) (revision 26682938)
림 (충돌구) (revision 27645834)
마그네슘 (revision 27661919)
마이크로미터 (revision 28705367)
마케마케 (왜행성) (revision 26405503)
맨틀 (revision 28942503)
메가클리테 (revision 28907421)
메티스 (위성) (revision 23978022)
명왕성 (revision 28921751)
명왕성의 위성 (revision 28903818)
명왕성족 (revision 27553778)
명왕성형 천체 (revision 27268668)
목성 (revision 28958627)
목성 LI (revision 28891035)
목성 LII (revision 28903781)
목성 LIV (revision 28903781)
목성 LIX (revision 28903781)
목성 LV (revision 28903781)
목성 LVI (revision 28903781)
목성 LXI (revision 28903781)
목성 LXIII (revision 28903781)
목성 LXIV (revision 28903781)
목성 LXIX (revision 28903781)
목성 LXVI (revision 28903781)
== End of Parsed pages ==
- Wikipedia parsing ended at: 2021-03-18 22:02:04.863862
1048 characters appeared 222213 times.
Most Frequent characters:
[ 0] Char 이: 3.8089580717599785 %
[ 1] Char 다: 3.088028153168356 %
[ 2] Char 의: 2.860318703226184 %
[ 3] Char 성: 2.408049934072264 %
[ 4] Char 는: 2.3108459001048542 %
[ 5] Char 에: 2.287444928964552 %
[ 6] Char 로: 1.8288758983497817 %
[ 7] Char 은: 1.5381638337991026 %
[ 8] Char 하: 1.5287134416078267 %
[ 9] Char 가: 1.5017123210613241 %
[10] Char 지: 1.470211013757071 %
[11] Char 을: 1.3694068303834608 %
[12] Char 기: 1.2281009661900968 %
[13] Char 도: 1.188499322721893 %
[14] Char 고: 1.1835491172883674 %
[15] Char 한: 1.1565479967418648 %
[16] Char 위: 1.079144784508557 %
[17] Char 서: 1.0746445977508066 %
[18] Char 스: 0.9832908065684727 %
[19] Char 으: 0.9607898727797203 %
[20] Char 리: 0.9256884160692669 %
[21] Char 어: 0.8482852038359592 %
[22] Char 대: 0.8446850544297588 %
[23] Char 들: 0.8118336910981806 %
[24] Char 있: 0.7902327946609784 %
[25] Char 사: 0.7560313753020751 %
[26] Char 를: 0.7555813566263 %
[27] Char 과: 0.7078793769941453 %
[28] Char 아: 0.6979789661270943 %
[29] Char 되: 0.6610774347135406 %
[30] Char 전: 0.6480268931160643 %
[31] Char 일: 0.6475768744402892 %
[32] Char 년: 0.6187756791906864 %
[33] Char 시: 0.6079752309720853 %
[34] Char 인: 0.5908745212926336 %
[35] Char 그: 0.5872743718864333 %
[36] Char 해: 0.5872743718864333 %
[37] Char 목: 0.5661234941250062 %
[38] Char 수: 0.5656734754492312 %
[39] Char 라: 0.5616233073672557 %
[40] Char 적: 0.5449726163635791 %
[41] Char 나: 0.5382223362269534 %
[42] Char 구: 0.533272130793428 %
[43] Char 주: 0.5251717946294772 %
[44] Char 자: 0.523821738602152 %
[45] Char 정: 0.5080710849500254 %
[46] Char 부: 0.49367048732522395 %
[47] Char 었: 0.4792698897004226 %
[48] Char 소: 0.44236835828686893 %
[49] Char 보: 0.43606809682601827 %
[50] Char 우: 0.43291796609559297 %
[51] Char 레: 0.432467947419818 %
[52] Char 발: 0.4243676112558671 %
[53] Char 행: 0.4243676112558671 %
[54] Char 여: 0.4054668268733152 %
[55] Char 며: 0.4050168081975402 %
[56] Char 와: 0.4036667521702151 %
[57] Char 면: 0.39556641600626424 %
[58] Char 명: 0.3802657810299127 %
[59] Char 된: 0.37801568765103755 %
[60] Char 화: 0.3775656689752625 %
[61] Char 만: 0.36721523943243645 %
[62] Char 중: 0.3667652207566614 %
[63] Char 상: 0.35146458578030987 %
[64] Char 공: 0.3478644363741095 %
[65] Char 것: 0.34516432431945926 %
[66] Char 오: 0.342464212264809 %
[67] Char 원: 0.3366139694797334 %
[68] Char 궤: 0.33391385742508317 %
[69] Char 마: 0.32941367066733274 %
[70] Char 문: 0.3285136333157826 %
[71] Char 계: 0.32581352126113233 %
[72] Char 유: 0.32176335317915694 %
[73] Char 제: 0.3190632411245067 %
[74] Char 영: 0.31771318509718155 %
[75] Char 장: 0.31546309171830633 %
[76] Char 양: 0.3064627182028054 %
[77] Char 국: 0.3060126995270304 %
[78] Char 게: 0.3051126621754803 %
[79] Char 관: 0.3042126248239302 %
[80] Char 니: 0.3033125874723801 %
[81] Char 였: 0.2992624193904047 %
[82] Char 데: 0.2965623073357544 %
[83] Char 경: 0.29431221395687923 %
[84] Char 체: 0.29296215792955405 %
[85] Char 분: 0.292512139253779 %
[86] Char 동: 0.29161210190222897 %
[87] Char 반: 0.28081165368362787 %
[88] Char 름: 0.27721150427742747 %
[89] Char 신: 0.27721150427742747 %
[90] Char 진: 0.27046122414080187 %
[91] Char 비: 0.2695611867892517 %
[92] Char 세: 0.2664110560588265 %
[93] Char 갈: 0.2659610373830514 %
[94] Char 태: 0.2632609253284011 %
[95] Char 음: 0.2610108319495259 %
[96] Char 했: 0.2556106078402254 %
[97] Char 월: 0.2556106078402254 %
[98] Char 메: 0.25201045843402503 %
[99] Char 작: 0.25111042108247494 %
[100] Char 형: 0.24931034637937474 %
[101] Char 견: 0.24841030902782463 %
[102] Char 릴: 0.24796029035204964 %
[103] Char 모: 0.24661023432472448 %
[104] Char 학: 0.24526017829739932 %
[105] Char 선: 0.24481015962162428 %
[106] Char 용: 0.2439101222700742 %
[107] Char 러: 0.24211004756697405 %
[108] Char 각: 0.241660028891199 %
[109] Char 천: 0.2385098981607737 %
[110] Char 크: 0.2358097861061234 %
[111] Char 미: 0.23535976743034837 %
[112] Char 연: 0.23310967405147315 %
[113] Char 점: 0.2308595806725979 %
[114] Char 트: 0.22815946861794767 %
[115] Char 토: 0.22590937523907242 %
[116] Char 물: 0.22590937523907242 %
[117] Char 역: 0.22590937523907242 %
[118] Char 때: 0.2236592818601972 %
[119] Char 거: 0.22230922583287208 %
[120] Char 표: 0.22095916980554692 %
[121] Char 간: 0.22005913245399683 %
[122] Char 터: 0.21915909510244677 %
[123] Char 돌: 0.2178090390751216 %
[124] Char 질: 0.21735902039934657 %
[125] Char 르: 0.21510892702047132 %
[126] Char 파: 0.21420888966892127 %
[127] Char 재: 0.21375887099314622 %
[128] Char 운: 0.2124088149658211 %
[129] Char 개: 0.21015872158694585 %
[130] Char 군: 0.2043084788018703 %
[131] Char 바: 0.20025831071989486 %
[132] Char 까: 0.19890825469256976 %
[133] Char 산: 0.19215797455594408 %
[134] Char 측: 0.191707955880169 %
[135] Char 속: 0.191707955880169 %
[136] Char 않: 0.191257937204394 %
[137] Char 달: 0.1845076570677683 %
[138] Char 조: 0.18180754501311805 %
[139] Char 충: 0.181357526337343 %
[140] Char 추: 0.17820739560691767 %
[141] Char 두: 0.17820739560691767 %
[142] Char 드: 0.1764073209038175 %
[143] Char 또: 0.17460724620071733 %
[144] Char 생: 0.17370720884916724 %
[145] Char 통: 0.17055707811874193 %
[146] Char 왕: 0.16875700341564176 %
[147] Char 방: 0.16695692871254159 %
[148] Char 함: 0.16470683533366637 %
[149] Char 안: 0.1633567793063412 %
[150] Char 합: 0.15930661122436582 %
[151] Char 불: 0.15930661122436582 %
[152] Char 히: 0.1584065738728157 %
[153] Char 내: 0.15660649916971553 %
[154] Char 후: 0.15570646181816544 %
[155] Char 타: 0.1530063497635152 %
[156] Char 약: 0.1521063124119651 %
[157] Char 른: 0.151206275060415 %
[158] Char 할: 0.151206275060415 %
[159] Char 호: 0.15075625638463996 %
[160] Char 네: 0.15075625638463996 %
[161] Char 포: 0.1489561816815398 %
[162] Char 심: 0.1480561443299897 %
[163] Char 탐: 0.14760612565421466 %
[164] Char 같: 0.14580605095111449 %
[165] Char 등: 0.14580605095111449 %
[166] Char 번: 0.14535603227533944 %
[167] Char 량: 0.14445599492378933 %
[168] Char 더: 0.14400597624801428 %
[169] Char 현: 0.14310593889646422 %
[170] Char 치: 0.13950578949026385 %
[171] Char 외: 0.1390557708144888 %
[172] Char 력: 0.13770571478716367 %
[173] Char 칼: 0.1359056400840635 %
[174] Char 따: 0.13275550935363817 %
[175] Char 금: 0.13185547200208808 %
[176] Char 실: 0.12645524789278756 %
[177] Char 카: 0.12645524789278756 %
[178] Char 차: 0.12510519186546243 %
[179] Char 단: 0.12240507981081214 %
[180] Char 프: 0.12240507981081214 %
[181] Char 테: 0.1219550611350371 %
[182] Char 규: 0.12150504245926207 %
[183] Char 결: 0.12150504245926207 %
[184] Char 독: 0.1179048930530617 %
[185] Char 존: 0.11610481834996153 %
[186] Char 변: 0.11475476232263639 %
[187] Char 매: 0.11475476232263639 %
[188] Char 루: 0.11430474364686136 %
[189] Char 무: 0.11115461291643604 %
[190] Char 석: 0.11070459424066098 %
[191] Char 져: 0.11070459424066098 %
[192] Char 근: 0.10935453821333586 %
[193] Char 초: 0.10755446351023566 %
[194] Char 향: 0.10665442615868559 %
[195] Char 설: 0.10485435145558542 %
[196] Char 째: 0.10395431410403531 %
[197] Char 본: 0.10395431410403531 %
[198] Char 직: 0.09765405264318469 %
[199] Char 았: 0.09720403396740965 %
[200] Char 말: 0.09720403396740965 %
[201] Char 식: 0.0967540152916346 %
[202] Char 많: 0.09630399661585956 %
[203] Char 저: 0.09630399661585956 %
[204] Char 요: 0.0958539779400845 %
[205] Char 교: 0.0958539779400845 %
[206] Char 련: 0.09540395926430947 %
[207] Char 회: 0.09540395926430947 %
[208] Char 알: 0.09135379118233407 %
[209] Char 려: 0.09045375383078398 %
[210] Char 배: 0.09045375383078398 %
[211] Char 열: 0.0877536417761337 %
[212] Char 칙: 0.08640358574880858 %
[213] Char 티: 0.08640358574880858 %
[214] Char 온: 0.08595356707303353 %
[215] Char 처: 0.08550354839725849 %
[216] Char 당: 0.08505352972148343 %
[217] Char 색: 0.08505352972148343 %
[218] Char 큰: 0.0846035110457084 %
[219] Char 임: 0.08415349236993334 %
[220] Char 예: 0.08325345501838326 %
[221] Char 권: 0.08280343634260821 %
[222] Char 순: 0.08280343634260821 %
[223] Char 없: 0.08190339899105813 %
[224] Char 던: 0.08010332428795795 %
[225] Char 케: 0.07965330561218291 %
[226] Char 항: 0.07920328693640785 %
[227] Char 최: 0.07875326826063282 %
[228] Char 강: 0.07830324958485776 %
[229] Char 률: 0.07830324958485776 %
[230] Char 망: 0.07830324958485776 %
[231] Char 론: 0.0765031748817576 %
[232] Char 쪽: 0.0756031375302075 %
[233] Char 붙: 0.07470310017865742 %
[234] Char 평: 0.07470310017865742 %
[235] Char 확: 0.07425308150288236 %
[236] Char 얼: 0.07335304415133227 %
[237] Char 래: 0.07290302547555724 %
[238] Char 밝: 0.07110295077245705 %
[239] Char 류: 0.07020291342090697 %
[240] Char 준: 0.06975289474513192 %
[241] Char 노: 0.06885285739358184 %
[242] Char 능: 0.06840283871780678 %
[243] Char 록: 0.06795282004203175 %
[244] Char 퍼: 0.06660276401470662 %
[245] Char 졌: 0.06615274533893156 %
[246] Char 받: 0.06615274533893156 %
[247] Char 획: 0.06615274533893156 %
[248] Char 키: 0.06390265196005634 %
[249] Char 밀: 0.06390265196005634 %
[250] Char 겨: 0.06390265196005634 %
[251] Char 민: 0.06300261460850626 %
[252] Char 플: 0.06255259593273121 %
[253] Char 피: 0.06120253990540607 %
[254] Char 탄: 0.06075252122963103 %
[255] Char 증: 0.05985248387808094 %
[256] Char 암: 0.059402465202305896 %
[257] Char 남: 0.05895244652653085 %
[258] Char 별: 0.05895244652653085 %
[259] Char 층: 0.0585024278507558 %
[260] Char 슘: 0.0585024278507558 %
[261] Char 베: 0.0585024278507558 %
[262] Char 쟁: 0.0585024278507558 %
[263] Char 출: 0.058052409174980765 %
[264] Char 축: 0.058052409174980765 %
[265] Char 특: 0.058052409174980765 %
[266] Char 압: 0.058052409174980765 %
[267] Char 극: 0.058052409174980765 %
[268] Char 높: 0.057602390499205715 %
[269] Char 디: 0.057602390499205715 %
[270] Char 코: 0.057602390499205715 %
[271] Char 철: 0.05715237182343068 %
[272] Char 종: 0.05715237182343068 %
[273] Char 란: 0.05670235314765563 %
[274] Char 탈: 0.05625233447188058 %
[275] Char 격: 0.05535229712033049 %
[276] Char 입: 0.05490227844455545 %
[277] Char 접: 0.0544522597687804 %
[278] Char 법: 0.0544522597687804 %
[279] Char 령: 0.05400224109300536 %
[280] Char 및: 0.053552222417230316 %
[281] Char 야: 0.05265218506568022 %
[282] Char 광: 0.052202166389905186 %
[283] Char 난: 0.051752147714130135 %
[284] Char 감: 0.051302129038355085 %
[285] Char 판: 0.05085211036258005 %
[286] Char 승: 0.05085211036258005 %
[287] Char 새: 0.05085211036258005 %
[288] Char 랑: 0.04995207301102996 %
[289] Char 급: 0.048602016983704824 %
[290] Char 건: 0.04815199830792978 %
[291] Char 페: 0.04815199830792978 %
[292] Char 투: 0.04725196095637969 %
[293] Char 립: 0.04680194228060464 %
[294] Char 폭: 0.0463519236048296 %
[295] Char 황: 0.045901904929054556 %
[296] Char 머: 0.045901904929054556 %
[297] Char 버: 0.04545188625327951 %
[298] Char 핵: 0.04500186757750446 %
[299] Char 든: 0.04500186757750446 %
[300] Char 틀: 0.044101830225954375 %
[301] Char 몇: 0.04320179287440429 %
[302] Char 날: 0.0423017555228542 %
[303] Char 족: 0.04185173684707916 %
[304] Char 럽: 0.04185173684707916 %
[305] Char 언: 0.04185173684707916 %
[306] Char 칭: 0.04185173684707916 %
[307] Char 풍: 0.04140171817130411 %
[308] Char 착: 0.04140171817130411 %
[309] Char 럼: 0.04140171817130411 %
[310] Char 균: 0.04140171817130411 %
[311] Char 복: 0.04140171817130411 %
[312] Char 집: 0.04140171817130411 %
[313] Char 너: 0.04095169949552906 %
[314] Char 움: 0.04095169949552906 %
[315] Char 낮: 0.04050168081975402 %
[316] Char 린: 0.04050168081975402 %
[317] Char 람: 0.04050168081975402 %
[318] Char 술: 0.040051662143978976 %
[319] Char 허: 0.040051662143978976 %
[320] Char 슷: 0.039601643468203926 %
[321] Char 응: 0.0382515874408788 %
[322] Char 찰: 0.03780156876510375 %
[323] Char 클: 0.03780156876510375 %
[324] Char 될: 0.03735155008932871 %
[325] Char 백: 0.03645151273777862 %
[326] Char 뉴: 0.03465143803467843 %
[327] Char 완: 0.03420141935890339 %
[328] Char 료: 0.033751400683128346 %
[329] Char 쓰: 0.033751400683128346 %
[330] Char 긴: 0.03330138200735331 %
[331] Char 편: 0.03330138200735331 %
[332] Char 떨: 0.03285136333157826 %
[333] Char 맨: 0.03285136333157826 %
[334] Char 첫: 0.03285136333157826 %
[335] Char 침: 0.03285136333157826 %
[336] Char 폴: 0.032401344655803216 %
[337] Char 왜: 0.032401344655803216 %
[338] Char 활: 0.03195132598002817 %
[339] Char 험: 0.03195132598002817 %
[340] Char 율: 0.03150130730425313 %
[341] Char 멘: 0.03150130730425313 %
[342] Char 습: 0.03150130730425313 %
[343] Char 늘: 0.03105128862847808 %
[344] Char 얻: 0.030601269952703035 %
[345] Char 환: 0.03015125127692799 %
[346] Char 울: 0.03015125127692799 %
[347] Char 깥: 0.03015125127692799 %
[348] Char 곳: 0.03015125127692799 %
[349] Char 북: 0.03015125127692799 %
[350] Char 왔: 0.029701232601152948 %
[351] Char 났: 0.029701232601152948 %
[352] Char 맹: 0.029701232601152948 %
[353] Char 염: 0.0292512139253779 %
[354] Char 먼: 0.0292512139253779 %
[355] Char 느: 0.0292512139253779 %
[356] Char 혜: 0.0292512139253779 %
[357] Char 킬: 0.0292512139253779 %
[358] Char 절: 0.0292512139253779 %
[359] Char 볼: 0.0292512139253779 %
[360] Char 줄: 0.0292512139253779 %
[361] Char 헤: 0.0292512139253779 %
[362] Char 필: 0.028351176573827814 %
[363] Char 센: 0.02790115789805277 %
[364] Char 값: 0.026551101870727636 %
[365] Char 품: 0.026551101870727636 %
[366] Char 참: 0.026101083194952593 %
[367] Char 륙: 0.026101083194952593 %
[368] Char 잡: 0.025651064519177542 %
[369] Char 링: 0.025651064519177542 %
[370] Char 께: 0.0252010458434025 %
[371] Char 킨: 0.0252010458434025 %
[372] Char 흔: 0.024301008491852412 %
[373] Char 몬: 0.02385098981607737 %
[374] Char 못: 0.02385098981607737 %
[375] Char 촬: 0.02385098981607737 %
[376] Char 막: 0.02385098981607737 %
[377] Char 쳐: 0.02385098981607737 %
[378] Char 찾: 0.02340097114030232 %
[379] Char 웨: 0.02340097114030232 %
[380] Char 슬: 0.022950952464527278 %
[381] Char 둘: 0.022950952464527278 %
[382] Char 징: 0.022950952464527278 %
[383] Char 례: 0.022950952464527278 %
[384] Char 올: 0.022950952464527278 %
[385] Char 살: 0.022950952464527278 %
[386] Char 즌: 0.02250093378875223 %
[387] Char 브: 0.02250093378875223 %
[388] Char 션: 0.02250093378875223 %
[389] Char 즈: 0.021600896437202144 %
[390] Char 런: 0.021600896437202144 %
[391] Char 쿠: 0.021600896437202144 %
[392] Char 헌: 0.0211508777614271 %
[393] Char 곱: 0.0211508777614271 %
[394] Char 웅: 0.0211508777614271 %
[395] Char 헬: 0.0211508777614271 %
[396] Char 밖: 0.020700859085652053 %
[397] Char 멀: 0.020700859085652053 %
[398] Char 혀: 0.020700859085652053 %
[399] Char 빠: 0.020700859085652053 %
[400] Char 범: 0.020700859085652053 %
[401] Char 므: 0.020700859085652053 %
[402] Char 힘: 0.020700859085652053 %
[403] Char 넘: 0.02025084040987701 %
[404] Char 워: 0.02025084040987701 %
[405] Char 커: 0.02025084040987701 %
[406] Char 팀: 0.02025084040987701 %
[407] Char 뮬: 0.02025084040987701 %
[408] Char 냈: 0.019800821734101963 %
[409] Char 총: 0.019800821734101963 %
[410] Char 손: 0.019800821734101963 %
[411] Char 갖: 0.019800821734101963 %
[412] Char 빛: 0.01935080305832692 %
[413] Char 액: 0.01935080305832692 %
[414] Char 창: 0.01935080305832692 %
[415] Char 논: 0.01935080305832692 %
[416] Char 낸: 0.018900784382551876 %
[417] Char 즉: 0.018900784382551876 %
[418] Char 억: 0.018900784382551876 %
[419] Char 청: 0.018900784382551876 %
[420] Char 혹: 0.018450765706776832 %
[421] Char 블: 0.018450765706776832 %
[422] Char 책: 0.018450765706776832 %
[423] Char 찬: 0.018450765706776832 %
[424] Char 곡: 0.018000747031001785 %
[425] Char 누: 0.018000747031001785 %
[426] Char 패: 0.018000747031001785 %
[427] Char 잘: 0.018000747031001785 %
[428] Char 림: 0.017550728355226742 %
[429] Char 검: 0.017550728355226742 %
[430] Char 채: 0.017550728355226742 %
[431] Char 녹: 0.017100709679451695 %
[432] Char 괴: 0.017100709679451695 %
[433] Char 십: 0.017100709679451695 %
[434] Char 글: 0.017100709679451695 %
[435] Char 빨: 0.017100709679451695 %
[436] Char 융: 0.016650691003676655 %
[437] Char 렸: 0.016650691003676655 %
[438] Char 길: 0.016650691003676655 %
[439] Char 삼: 0.016650691003676655 %
[440] Char 협: 0.016650691003676655 %
[441] Char 잃: 0.016650691003676655 %
[442] Char 병: 0.016650691003676655 %
[443] Char 옅: 0.016200672327901608 %
[444] Char 념: 0.015750653652126564 %
[445] Char 뜻: 0.015750653652126564 %
[446] Char 켜: 0.015300634976351517 %
[447] Char 걸: 0.015300634976351517 %
[448] Char 효: 0.015300634976351517 %
[449] Char 육: 0.015300634976351517 %
[450] Char 벨: 0.015300634976351517 %
[451] Char 업: 0.015300634976351517 %
[452] Char 숫: 0.014850616300576474 %
[453] Char 틴: 0.014850616300576474 %
[454] Char 잔: 0.014850616300576474 %
[455] Char 뒤: 0.014850616300576474 %
[456] Char 벽: 0.014400597624801429 %
[457] Char 벌: 0.014400597624801429 %
[458] Char 짧: 0.014400597624801429 %
[459] Char 륨: 0.014400597624801429 %
[460] Char 친: 0.013950578949026385 %
[461] Char 섭: 0.01350056027325134 %
[462] Char 톤: 0.01350056027325134 %
[463] Char 끌: 0.01350056027325134 %
[464] Char 애: 0.013050541597476296 %
[465] Char 눈: 0.013050541597476296 %
[466] Char 담: 0.013050541597476296 %
[467] Char 캐: 0.013050541597476296 %
[468] Char 끝: 0.013050541597476296 %
[469] Char 턴: 0.01260052292170125 %
[470] Char 희: 0.01260052292170125 %
[471] Char 략: 0.01260052292170125 %
[472] Char 떤: 0.01260052292170125 %
[473] Char 깊: 0.012150504245926206 %
[474] Char 켰: 0.012150504245926206 %
[475] Char 렇: 0.012150504245926206 %
[476] Char 흡: 0.012150504245926206 %
[477] Char 겼: 0.012150504245926206 %
[478] Char 슈: 0.012150504245926206 %
[479] Char 빈: 0.012150504245926206 %
[480] Char 곽: 0.012150504245926206 %
[481] Char 앙: 0.012150504245926206 %
[482] Char 악: 0.012150504245926206 %
[483] Char 택: 0.012150504245926206 %
[484] Char 취: 0.012150504245926206 %
[485] Char 늄: 0.012150504245926206 %
[486] Char 찌: 0.01170048557015116 %
[487] Char 박: 0.01170048557015116 %
[488] Char 맞: 0.01170048557015116 %
[489] Char 앞: 0.01170048557015116 %
[490] Char 톨: 0.01170048557015116 %
[491] Char 렵: 0.01170048557015116 %
[492] Char 덮: 0.011250466894376115 %
[493] Char 펙: 0.011250466894376115 %
[494] Char 묘: 0.011250466894376115 %
[495] Char 쌍: 0.011250466894376115 %
[496] Char 덕: 0.010800448218601072 %
[497] Char 켈: 0.010800448218601072 %
[498] Char 엔: 0.010800448218601072 %
[499] Char 델: 0.010800448218601072 %
[500] Char 핀: 0.010800448218601072 %
[501] Char 힌: 0.010800448218601072 %
[502] Char 섬: 0.010800448218601072 %
[503] Char 씨: 0.010800448218601072 %
[504] Char 퇴: 0.010350429542826027 %
[505] Char 렌: 0.010350429542826027 %
[506] Char 웹: 0.010350429542826027 %
[507] Char 텐: 0.010350429542826027 %
[508] Char 섯: 0.010350429542826027 %
[509] Char 흑: 0.010350429542826027 %
[510] Char 큼: 0.009900410867050981 %
[511] Char 혼: 0.009900410867050981 %
[512] Char 써: 0.009900410867050981 %
[513] Char 슨: 0.009900410867050981 %
[514] Char 송: 0.009900410867050981 %
[515] Char 좌: 0.009900410867050981 %
[516] Char 덜: 0.009900410867050981 %
[517] Char 뀌: 0.009900410867050981 %
[518] Char 엘: 0.009900410867050981 %
[519] Char 텔: 0.009900410867050981 %
[520] Char 쪼: 0.009450392191275938 %
[521] Char 락: 0.009450392191275938 %
[522] Char 겉: 0.009450392191275938 %
[523] Char 렀: 0.009450392191275938 %
[524] Char 욕: 0.009450392191275938 %
[525] Char 힐: 0.009450392191275938 %
[526] Char 떠: 0.009000373515500893 %
[527] Char 널: 0.009000373515500893 %
[528] Char 콘: 0.009000373515500893 %
[529] Char 램: 0.009000373515500893 %
[530] Char 엄: 0.009000373515500893 %
[531] Char 룬: 0.009000373515500893 %
[532] Char 딸: 0.009000373515500893 %
[533] Char 벼: 0.009000373515500893 %
[534] Char 윙: 0.009000373515500893 %
[535] Char 휘: 0.008550354839725847 %
[536] Char 밤: 0.008550354839725847 %
[537] Char 뿐: 0.008550354839725847 %
[538] Char 곧: 0.008550354839725847 %
[539] Char 훨: 0.008550354839725847 %
[540] Char 씬: 0.008550354839725847 %
[541] Char 큐: 0.008550354839725847 %
[542] Char 숭: 0.008550354839725847 %
[543] Char 띄: 0.008100336163950804 %
[544] Char 닌: 0.008100336163950804 %
[545] Char 깝: 0.008100336163950804 %
[546] Char 흐: 0.008100336163950804 %
[547] Char 웠: 0.008100336163950804 %
[548] Char 롯: 0.008100336163950804 %
[549] Char 뜨: 0.008100336163950804 %
[550] Char 죽: 0.008100336163950804 %
[551] Char 즘: 0.008100336163950804 %
[552] Char 닉: 0.008100336163950804 %
[553] Char 붕: 0.007650317488175759 %
[554] Char 욱: 0.007650317488175759 %
[555] Char 끼: 0.007650317488175759 %
[556] Char 익: 0.007650317488175759 %
[557] Char 옛: 0.007650317488175759 %
[558] Char 붉: 0.007650317488175759 %
[559] Char 칠: 0.007650317488175759 %
[560] Char 웰: 0.007650317488175759 %
[561] Char 컸: 0.007650317488175759 %
[562] Char 씩: 0.007650317488175759 %
[563] Char 낙: 0.007650317488175759 %
[564] Char 녀: 0.007650317488175759 %
[565] Char 얇: 0.007650317488175759 %
[566] Char 싸: 0.007200298812400714 %
[567] Char 꺼: 0.007200298812400714 %
[568] Char 찍: 0.007200298812400714 %
[569] Char 랜: 0.007200298812400714 %
[570] Char 골: 0.007200298812400714 %
[571] Char 옹: 0.007200298812400714 %
[572] Char 빌: 0.007200298812400714 %
[573] Char 칸: 0.007200298812400714 %
The first 574 characters have an accumulated ratio of 0.9900230859580663.
14099 sequences found.
First 13365 (typical positive ratio): 0.995000852514919
Next 587 (13952-13365): 0.004003410059676082
Rest: 0.00099573742540493
- Processing end: 2021-03-18 22:02:18.933817

73
script/langs/ko.py Normal file
View File

@ -0,0 +1,73 @@
#!/bin/python3
# -*- coding: utf-8 -*-
# ##### BEGIN LICENSE BLOCK #####
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Jehan <jehan@girinstud.io>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ##### END LICENSE BLOCK #####
import re
## Mandatory Properties ##

# The human name for the language, in English.
name = 'Korean'
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
# or use another catalog as a last resort.
code = 'ko'

# ASCII characters are not commonly used in Korean.
use_ascii = False
# No single-byte charset is listed for Korean: this description is only
# used to generate character and sequence statistics.
charsets = []

## Optional Properties ##

# Ranges of code points, given as inclusive (start, end) pairs.
# The endpoints are written as integer code points rather than as literal
# characters so that they survive any encoding round-trip of this file
# (an empty-string endpoint cannot be converted with ord()).
frequent_ranges = [(0xAC00, 0xD7A3)]  # Hangul Syllables (AC00–D7A3)
unicode_ranges = [(0xAC00, 0xD7A3),  # Hangul Syllables (AC00–D7A3)
                  (0x1100, 0x11FF),  # Hangul Jamo (1100–11FF)
                  (0x3130, 0x318F),  # Hangul Compatibility Jamo (3130–318F)
                  (0xA960, 0xA97F),  # Hangul Jamo Extended-A (A960–A97F)
                  # NOTE(review): the two sub-ranges below are the assigned
                  # parts of the Extended-B block per the Unicode chart;
                  # confirm against the intended endpoints.
                  (0xD7B0, 0xD7C6),  # Hangul Jamo Extended-B (D7B0–D7FF)
                  (0xD7CB, 0xD7FB),  # Hangul Jamo Extended-B (D7B0–D7FF) - second part
                 ]

# The start page. Though optional, it is advised to choose one yourself.
start_pages = ['칼리스토_(위성)']
# give possibility to select another code for the Wikipedia URL.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True

View File

@ -22,6 +22,7 @@ set(
LangModels/LangHebrewModel.cpp
LangModels/LangIrishModel.cpp
LangModels/LangItalianModel.cpp
LangModels/LangKoreanModel.cpp
LangModels/LangLithuanianModel.cpp
LangModels/LangLatvianModel.cpp
LangModels/LangMalteseModel.cpp

File diff suppressed because it is too large Load Diff

View File

@ -123,6 +123,7 @@ extern const LanguageModel HebrewModel;
extern const LanguageModel HungarianModel;
extern const LanguageModel IrishModel;
extern const LanguageModel ItalianModel;
extern const LanguageModel KoreanModel;
extern const LanguageModel LatvianModel;
extern const LanguageModel LithuanianModel;
extern const LanguageModel MalteseModel;

View File

@ -118,6 +118,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel);
langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel);
langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel);
langDetectors[i][j++] = new nsLanguageDetector(&KoreanModel);
}
else
{

View File

@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
#define NUM_OF_LANGUAGES 27
#define NUM_OF_LANGUAGES 28
class nsMBCSGroupProber: public nsCharSetProber {
public: