diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index af30d1c..0505a15 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -55,6 +55,7 @@ set(
 	nsSJISProber.cpp
 	nsUTF8Prober.cpp
 	nsLanguageDetector.cpp
+	nsCJKDetector.cpp
 	nsLatin1Prober.cpp
 	nsUniversalDetector.cpp
 	uchardet.cpp
diff --git a/src/nsCJKDetector.cpp b/src/nsCJKDetector.cpp
new file mode 100644
index 0000000..7ee7f31
--- /dev/null
+++ b/src/nsCJKDetector.cpp
@@ -0,0 +1,272 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Universal charset detector code. This
+ * file was later added by Jehan in 2021 to add language detection.
+ *
+ * The Initial Developer of the Original Code is Netscape Communications
+ * Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001 the
+ * Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Jehan (2021)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+/* Number of code points HandleData() must have seen before it tries to
+ * shortcut the detection. */
+#define CJK_ENOUGH_CHAR_THRESHOLD 4096
+/* Confidence above which HandleData() switches to STATE_FOUND. */
+#define CJK_POSITIVE_SHORTCUT_THRESHOLD (float)0.95
+/* Confidence below which HandleData() switches to STATE_UNLIKELY. */
+#define CJK_NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
+
+#include "nsCJKDetector.h"
+
+/* Reset the base detector state, then zero the per-script counters. */
+void nsCJKDetector::Reset(void)
+{
+  nsLanguageDetector::Reset();
+
+  mHangulChar = 0;
+  mKanaChar = 0;
+  mHanziChar = 0;
+}
+
+/* Classify each of the cpLen code points of codePoints and update the
+ * counters: Hangul, Kana and Hanzi each have their own counter; control
+ * characters, punctuation, symbols and emoji are only counted in
+ * mTotalChar; anything else is added to mOutChar.
+ *
+ * Once more than CJK_ENOUGH_CHAR_THRESHOLD code points have been seen and
+ * the state is still STATE_DETECTING, the confidence is recomputed and the
+ * state moves to STATE_FOUND or STATE_UNLIKELY if it crosses one of the
+ * shortcut thresholds. Returns the resulting state.
+ *
+ * The branches are tested in order, so the C0 test (<= 0x1F) already
+ * consumes tab, line feed and carriage return, as well as any negative
+ * value, before the "common characters" branch is reached. Both branches
+ * ignore the character, so the counters do not depend on it.
+ */
+nsDetectState nsCJKDetector::HandleData(const int* codePoints, PRUint32 cpLen)
+{
+  for (PRUint32 i = 0; i < cpLen; i++)
+  {
+    mTotalChar++;
+
+    if ((codePoints[i] >= 0xAC00 && codePoints[i] <= 0xD7A3) ||
+        (codePoints[i] >= 0x1100 && codePoints[i] <= 0x11FF) ||
+        (codePoints[i] >= 0x3130 && codePoints[i] <= 0x318F) ||
+        (codePoints[i] >= 0xA960 && codePoints[i] <= 0xA97F) ||
+        (codePoints[i] >= 0xD7B0 && codePoints[i] <= 0xD7FF))
+    {
+      mHangulChar++;
+    }
+    else if ((codePoints[i] >= 0x3041 && codePoints[i] <= 0x309F) ||
+             (codePoints[i] >= 0x30A0 && codePoints[i] <= 0x30FF))
+    {
+      mKanaChar++;
+    }
+    else if (codePoints[i] >= 0x4E00 && codePoints[i] <= 0x9FBF)
+    {
+      mHanziChar++;
+    }
+    else if (codePoints[i] <= 0x1F || codePoints[i] == 0x7F || /* C0 */
+             (codePoints[i] <= 0x9F && codePoints[i] >= 0x80) || /* C1 */
+             /* Separators: not strictly control characters for the Unicode
+              * standard, but we'll consider them as such for our purpose.
+              */
+             codePoints[i] == 0x2028 || codePoints[i] == 0x2029 ||
+             /* Tags: U+E0001 is deprecated but others are still usable as
+              * emoji identifiers. Not sure how to use them.
+              */
+             codePoints[i] == 0xE0001 ||
+             /* Interlinear annotations. */
+             codePoints[i] == 0xFFF9 || codePoints[i] == 0xFFFA ||
+             codePoints[i] == 0xFFFB ||
+             /* Bidirectional text control. */
+             codePoints[i] == 0x061C || codePoints[i] == 0x200E ||
+             codePoints[i] == 0x200F ||
+             (codePoints[i] >= 0x202A && codePoints[i] <= 0x202E) ||
+             (codePoints[i] >= 0x2066 && codePoints[i] <= 0x2069) ||
+             /* Control pictures. */
+             (codePoints[i] >= 0x2400 && codePoints[i] <= 0x2426))
+    {
+      /* XXX: some control characters such as variation selectors may
+       * need to be considered separately (basically just as if they
+       * were not here and simply skipped?). */
+      //mCtrlChar++;
+    }
+    /* When encountering an illegal codepoint, no need
+     * to continue analyzing data. It means this is not right, hence
+     * that the encoding we deduced this codepoint from is wrong.
+     * Unfortunately listing all illegal codePoints in Unicode might be
+     * a daunting task and comparing each character to all these
+     * illegal codePoints would be a lot of additional work. Is it
+     * really necessary? XXX
+     */
+    else if (/* Tab, line feed and carriage returns are common enough
+              * that they should be considered as commonly used characters.
+              */
+             codePoints[i] == 0x9 || codePoints[i] == 0xA || codePoints[i] == 0xd ||
+             (codePoints[i] >= 0x20 && codePoints[i] <= 0x40) ||
+             (codePoints[i] >= 0x5B && codePoints[i] <= 0x5F) ||
+             (codePoints[i] >= 0x7B && codePoints[i] <= 0x7E) ||
+             (codePoints[i] >= 0xA0 && codePoints[i] <= 0xB4) ||
+             (codePoints[i] >= 0xB6 && codePoints[i] <= 0xBF) ||
+             codePoints[i] == 0xD7 ||
+             codePoints[i] == 0xF7 ||
+             /* General Punctuation */
+             (codePoints[i] >= 0x2000 && codePoints[i] <= 0x206F) ||
+             /* Vertical Forms */
+             (codePoints[i] >= 0xFE10 && codePoints[i] <= 0xFE1F) ||
+             /* CJK Symbols and Punctuation */
+             (codePoints[i] >= 0x3000 && codePoints[i] <= 0x303F) ||
+             /* Halfwidth and Fullwidth Forms */
+             (codePoints[i] >= 0xFF00 && codePoints[i] <= 0xFFEF))
+    {
+      /* Punctuations, various symbols, even numbers are simply
+       * ignored.
+       * As for halfwidth and fullwidth characters, I'm not sure what
+       * to do with them, but let's go with the same logic of
+       * skipping them, at least for now.
+       */
+      //mVariousBetween++;
+    }
+    else if (/* Emoticons, variation selectors and emoji modifiers. */
+             (codePoints[i] >= 0x1F600 && codePoints[i] <= 0x1F64F) ||
+             codePoints[i] == 0xFE0E || codePoints[i] == 0xFE0F ||
+             (codePoints[i] >= 0x1F3FB && codePoints[i] <= 0x1F3FF) ||
+             /* Miscellaneous Symbols */
+             (codePoints[i] >= 0x2600 && codePoints[i] <= 0x26FF) ||
+             /* Supplemental Symbols and Pictographs */
+             (codePoints[i] >= 0x1F90C && codePoints[i] <= 0x1F93A) ||
+             (codePoints[i] >= 0x1F93C && codePoints[i] <= 0x1F945) ||
+             (codePoints[i] >= 0x1F947 && codePoints[i] <= 0x1F978) ||
+             (codePoints[i] >= 0x1F97A && codePoints[i] <= 0x1F9CB) ||
+             (codePoints[i] >= 0x1F9CD && codePoints[i] <= 0x1F9FF) ||
+             /* Miscellaneous Symbols and Pictographs */
+             (codePoints[i] >= 0x1F300 && codePoints[i] <= 0x1F5FF) ||
+             /* Transport and Map Symbols */
+             (codePoints[i] >= 0x1F680 && codePoints[i] <= 0x1F6FF) ||
+             /* Dingbat */
+             (codePoints[i] >= 0x2700 && codePoints[i] <= 0x27BF))
+    {
+      //mEmoticons++;
+    }
+    else
+    {
+      /* All the rest is to be considered as non-frequent characters.
+       * These are not disqualifying (we may also have a text with a bit
+       * of foreign quotes in it or very unusual characters sometimes)
+       * but they will lower the confidence a bit.
+       */
+      mOutChar++;
+    }
+  }
+
+  if (mState == STATE_DETECTING && mTotalChar > CJK_ENOUGH_CHAR_THRESHOLD)
+  {
+    ComputeConfidence();
+    if (confidence > CJK_POSITIVE_SHORTCUT_THRESHOLD)
+      mState = STATE_FOUND;
+    else if (confidence < CJK_NEGATIVE_SHORTCUT_THRESHOLD)
+      mState = STATE_UNLIKELY;
+  }
+
+  return mState;
+}
+
+/* Return the confidence of the best-matching language, recomputed from the
+ * current counters. */
+float nsCJKDetector::GetConfidence(void)
+{
+  ComputeConfidence();
+
+  return confidence;
+}
+
+/* Return the best-matching language code ("ko", "zh" or "ja"), recomputed
+ * from the current counters, or a null pointer when nothing was counted. */
+const char* nsCJKDetector::GetLanguage()
+{
+  ComputeConfidence();
+
+  return language;
+}
+
+/* Recompute `language` and `confidence` from the current counters.
+ *
+ * Each language is scored with the share of its script among the counted
+ * characters: Hangul for "ko", Hanzi for "zh", Kana plus half of the Hanzi
+ * for "ja". "ko" is the starting pick; "zh", then "ja", replace it only
+ * with a strictly greater score. When no character has been counted,
+ * `language` stays null and `confidence` stays at 0.01.
+ */
+void nsCJKDetector::ComputeConfidence(void)
+{
+  float confKo = 0.01f;
+  float confJa = 0.01f;
+  float confZh = 0.01f;
+  float all_chars = (float) (mOutChar + mHanziChar + mHangulChar + mKanaChar);
+  float hangul_chars = (float) mHangulChar;
+  float hanzi_chars = (float) mHanziChar;
+  float kana_chars = (float) mKanaChar;
+
+  language = nullptr;
+  confidence = 0.01f;
+
+  /* Test the divisor itself: mTotalChar also counts the ignored
+   * characters, so it can be positive while all_chars is still 0, and
+   * every ratio below would then be 0/0 (NaN).
+   */
+  if (all_chars > 0.0f)
+  {
+    confKo = hangul_chars / all_chars;
+    language = "ko";
+    confidence = confKo;
+
+    confZh = hanzi_chars / all_chars;
+    if (confZh > confKo)
+    {
+      language = "zh";
+      confidence = confZh;
+    }
+
+    /* Japanese still uses a lot of Chinese characters, so I think this
+     * very naive confidence computation will need to be revised soon.
+     * We should probably compute statistics of hanzi / (hanzi + kana)
+     * characters and use this as a weight modifier.
+     */
+    confJa = (kana_chars + hanzi_chars / 2.0f) / all_chars;
+    if (confJa > confidence)
+    {
+      language = "ja";
+      confidence = confJa;
+    }
+  }
+}
diff --git a/src/nsCJKDetector.h b/src/nsCJKDetector.h
new file mode 100644
index 0000000..6490aea
--- /dev/null
+++ b/src/nsCJKDetector.h
@@ -0,0 +1,80 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Universal charset detector code. This
+ * file was later added by Jehan in 2021 to add language detection.
+ *
+ * The Initial Developer of the Original Code is Netscape Communications
+ * Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001 the
+ * Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Jehan (2021)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+#ifndef nsCJKDetector_h__
+#define nsCJKDetector_h__
+
+#include "nsLanguageDetector.h"
+
+/* Language detector telling Korean, Japanese and Chinese apart by counting
+ * how many code points fall in the Hangul, Kana and Hanzi ranges, instead
+ * of using a language model (the base class is given a null model).
+ */
+class nsCJKDetector: public nsLanguageDetector {
+public:
+  nsCJKDetector()
+    : nsLanguageDetector(nullptr), language(nullptr), confidence(0.01f)
+  { Reset(); }
+  virtual ~nsCJKDetector() {}
+
+  /* NOTE(review): the four methods below are meant to replace the
+   * nsLanguageDetector implementations when called through a base
+   * pointer; confirm they are declared virtual in the base class. */
+  const char* GetLanguage();
+  nsDetectState HandleData(const int* codePoints, PRUint32 cpLen);
+  float GetConfidence(void);
+  void Reset(void);
+
+protected:
+  /* Chinese characters (Kanji in Japanese) */
+  PRUint32 mHanziChar;
+  /* Korean alphabet (Hangul letters and syllables) */
+  PRUint32 mHangulChar;
+  /* Hiragana and Katakana (Japanese) */
+  PRUint32 mKanaChar;
+
+  /* Result of the last ComputeConfidence() call. */
+  const char* language;
+  float confidence;
+
+private:
+
+  void ComputeConfidence(void);
+};
+
+#endif /* nsCJKDetector_h__ */
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 1b99da1..f19ec25 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -38,6 +38,7 @@
  * ***** END LICENSE BLOCK ***** */
 #include <stdio.h>
 
+#include "nsCJKDetector.h"
 #include "nsMBCSGroupProber.h"
 #include "nsUniversalDetector.h"
 
@@ -106,7 +107,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
         langDetectors[i][j++] = new nsLanguageDetector(&HungarianModel);
         langDetectors[i][j++] = new nsLanguageDetector(&IrishModel);
         langDetectors[i][j++] = new nsLanguageDetector(&ItalianModel);
-        langDetectors[i][j++] = new nsLanguageDetector(&KoreanModel);
+        /* Korean is now detected by nsCJKDetector (added below). */
         langDetectors[i][j++] = new nsLanguageDetector(&LatvianModel);
         langDetectors[i][j++] = new nsLanguageDetector(&LithuanianModel);
         langDetectors[i][j++] = new nsLanguageDetector(&MalteseModel);
@@ -120,6 +121,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
         langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel);
         langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel);
         langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel);
+        langDetectors[i][j++] = new nsCJKDetector();
       }
       else
       {