mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Actually my previous commit was not handling all cases, though it was taking care of the buffer overflow triggered by the provided byte sequence. Yet I believe it was still possible to craft special input sequences too long for codePointBuffer. This additional commit would handle these other cases by processing the input in manageable sub-strings.
532 lines
16 KiB
C++
532 lines
16 KiB
C++
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
*
|
|
* The contents of this file are subject to the Mozilla Public License Version
|
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
* http://www.mozilla.org/MPL/
|
|
*
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
* for the specific language governing rights and limitations under the
|
|
* License.
|
|
*
|
|
* The Original Code is Mozilla Universal charset detector code.
|
|
*
|
|
* The Initial Developer of the Original Code is
|
|
* Netscape Communications Corporation.
|
|
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
* the Initial Developer. All Rights Reserved.
|
|
*
|
|
* Contributor(s):
|
|
* Shy Shalom <shooshX@gmail.com>
|
|
* Proofpoint, Inc.
|
|
*
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
* use your version of this file under the terms of the MPL, indicate your
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
* the provisions above, a recipient may use your version of this file under
|
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
*
|
|
* ***** END LICENSE BLOCK ***** */
|
|
#include <stdio.h>
|
|
|
|
#include "nsCJKDetector.h"
|
|
#include "nsMBCSGroupProber.h"
|
|
#include "nsUniversalDetector.h"
|
|
|
|
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
|
const char *ProberName[] =
|
|
{
|
|
"UTF-8",
|
|
"SJIS",
|
|
"EUC-JP",
|
|
"GB18030",
|
|
"EUC-KR",
|
|
"Big5",
|
|
"EUC-TW",
|
|
"Johab"
|
|
};
|
|
|
|
#endif
|
|
|
|
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
|
{
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
mProbers[i] = nsnull;
|
|
codePointBuffer[i] = nsnull;
|
|
codePointBufferSize[i] = 0;
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
|
|
mProbers[0] = new nsUTF8Prober();
|
|
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
|
{
|
|
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
|
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
|
}
|
|
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
|
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
|
|
if (aLanguageFilter & NS_FILTER_KOREAN)
|
|
{
|
|
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
|
|
mProbers[7] = new nsJohabProber(aLanguageFilter == NS_FILTER_KOREAN);
|
|
}
|
|
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
|
|
{
|
|
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
|
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
|
}
|
|
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
if (mProbers[i]->DecodeToUnicode())
|
|
{
|
|
int j = 0;
|
|
|
|
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&CatalanModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&EnglishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&EsperantoModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&EstonianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&FrenchModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&GermanModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&GeorgianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&GreekModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&HebrewModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&HindiModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&HungarianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&IrishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&ItalianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&LatvianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&LithuanianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&MacedonianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&MalteseModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&NorwegianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&PolishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&PortugueseModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&RomanianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&RussianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&SerbianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&SlovakModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&SloveneModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&SpanishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&SwedishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&UkrainianModel);
|
|
langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel);
|
|
langDetectors[i][j++] = new nsCJKDetector();
|
|
}
|
|
else
|
|
{
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
langDetectors[i][j] = nsnull;
|
|
}
|
|
}
|
|
Reset();
|
|
}
|
|
|
|
nsMBCSGroupProber::~nsMBCSGroupProber()
|
|
{
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
delete mProbers[i];
|
|
|
|
if (codePointBufferSize[i] != 0)
|
|
delete [] codePointBuffer[i];
|
|
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
if (langDetectors[i][j])
|
|
delete langDetectors[i][j];
|
|
}
|
|
}
|
|
|
|
#define CANDIDATE_THRESHOLD 0.3f
|
|
|
|
int nsMBCSGroupProber::GetCandidates()
|
|
{
|
|
int num_candidates = 0;
|
|
|
|
CheckCandidates();
|
|
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
if (candidates[i][j])
|
|
num_candidates++;
|
|
|
|
return num_candidates;
|
|
}
|
|
|
|
const char* nsMBCSGroupProber::GetCharSetName(int candidate)
|
|
{
|
|
int num_candidates = GetCandidates();
|
|
int candidate_it = 0;
|
|
|
|
if (num_candidates == 0)
|
|
return NULL;
|
|
else if (candidate >= num_candidates)
|
|
/* Just show the first candidate. */
|
|
candidate = 0;
|
|
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
if (candidates[i][j])
|
|
{
|
|
if (candidate == candidate_it)
|
|
{
|
|
/* We assume that probers included in the nsMBCSGroupProber
|
|
* return only one candidate themselves.
|
|
* */
|
|
return mProbers[i]->GetCharSetName(0);
|
|
}
|
|
candidate_it++;
|
|
}
|
|
|
|
/* Should not happen. */
|
|
return NULL;
|
|
}
|
|
|
|
const char* nsMBCSGroupProber::GetLanguage(int candidate)
|
|
{
|
|
const char* lang = NULL;
|
|
int num_candidates = GetCandidates();
|
|
int candidate_it = 0;
|
|
|
|
if (num_candidates == 0)
|
|
return NULL;
|
|
else if (candidate >= num_candidates)
|
|
/* Just show the first candidate. */
|
|
candidate = 0;
|
|
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
if (candidates[i][j])
|
|
{
|
|
if (candidate == candidate_it)
|
|
{
|
|
/* We assume that probers included in the nsMBCSGroupProber
|
|
* return only one candidate themselves.
|
|
* */
|
|
lang = mProbers[i]->GetLanguage(0);
|
|
|
|
if (! lang)
|
|
{
|
|
/* The prober does not come with its own language. */
|
|
if (langDetectors[i][j])
|
|
lang = langDetectors[i][j]->GetLanguage();
|
|
}
|
|
|
|
return lang;
|
|
}
|
|
candidate_it++;
|
|
}
|
|
|
|
return lang;
|
|
}
|
|
|
|
void nsMBCSGroupProber::Reset(void)
|
|
{
|
|
mActiveNum = 0;
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
if (mProbers[i])
|
|
{
|
|
mProbers[i]->Reset();
|
|
mIsActive[i] = PR_TRUE;
|
|
++mActiveNum;
|
|
|
|
if (codePointBufferSize[i] == 0 && mProbers[i]->DecodeToUnicode())
|
|
{
|
|
codePointBufferSize[i] = 1024;
|
|
codePointBuffer[i] = new int[codePointBufferSize[i]];
|
|
}
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
else
|
|
mIsActive[i] = PR_FALSE;
|
|
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
{
|
|
if (langDetectors[i][j])
|
|
langDetectors[i][j]->Reset();
|
|
|
|
candidates[i][j] = false;
|
|
}
|
|
}
|
|
mState = eDetecting;
|
|
mKeepNext = 0;
|
|
}
|
|
|
|
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
|
int** cpBuffer,
|
|
int* cpBufferIdx)
|
|
{
|
|
nsProbingState st;
|
|
PRUint32 start = 0;
|
|
PRUint32 keepNext = mKeepNext;
|
|
|
|
//do filtering to reduce load to probers
|
|
for (PRUint32 pos = 0; pos < aLen; ++pos)
|
|
{
|
|
if (aBuf[pos] & 0x80)
|
|
{
|
|
if (!keepNext)
|
|
start = pos;
|
|
keepNext = 2;
|
|
}
|
|
else if (keepNext)
|
|
{
|
|
if (--keepNext == 0)
|
|
{
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
int sequenceLength;
|
|
|
|
if (!mIsActive[i])
|
|
continue;
|
|
|
|
sequenceLength = pos + 1 - start;
|
|
|
|
if (codePointBuffer[i] && codePointBufferIdx[i] + sequenceLength > codePointBufferSize[i])
|
|
{
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
|
|
if (codePointBuffer[i])
|
|
{
|
|
while (sequenceLength > 0)
|
|
{
|
|
int subLength = (sequenceLength > codePointBufferSize[i]) ? codePointBufferSize[i] : sequenceLength;
|
|
|
|
st = mProbers[i]->HandleData(aBuf + start, subLength,
|
|
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
|
|
|
|
if (codePointBufferIdx[i] > 0)
|
|
{
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
|
|
sequenceLength -= subLength;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
st = mProbers[i]->HandleData(aBuf + start, sequenceLength, NULL, NULL);
|
|
}
|
|
|
|
if (st == eFoundIt)
|
|
{
|
|
float cf = mProbers[i]->GetConfidence(0);
|
|
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
{
|
|
float langConf = langDetectors[i][j] ? langDetectors[i][j]->GetConfidence() : 1.0;
|
|
|
|
if (cf * langConf > CANDIDATE_THRESHOLD)
|
|
{
|
|
/* There is at least one (charset, lang) couple for
|
|
* which the confidence is high enough.
|
|
*/
|
|
mState = eFoundIt;
|
|
return mState;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
if (codePointBuffer[i])
|
|
{
|
|
if (codePointBufferIdx[i] == codePointBufferSize[i] - 1)
|
|
{
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
|
|
codePointBuffer[i][(codePointBufferIdx[i])++] = aBuf[pos];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (keepNext) {
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
if (!mIsActive[i])
|
|
continue;
|
|
|
|
if (codePointBuffer[i])
|
|
st = mProbers[i]->HandleData(aBuf + start, aLen - start,
|
|
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
|
|
else
|
|
st = mProbers[i]->HandleData(aBuf + start, aLen - start, NULL, NULL);
|
|
|
|
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
|
|
{
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
|
|
if (st == eFoundIt)
|
|
{
|
|
float cf = mProbers[i]->GetConfidence(0);
|
|
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
{
|
|
float langConf = langDetectors[i][j] ? langDetectors[i][j]->GetConfidence() : 1.0;
|
|
|
|
if (cf * langConf > CANDIDATE_THRESHOLD)
|
|
{
|
|
/* There is at least one (charset, lang) couple for
|
|
* which the confidence is high enough.
|
|
*/
|
|
mState = eFoundIt;
|
|
return mState;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
mKeepNext = keepNext;
|
|
|
|
return mState;
|
|
}
|
|
|
|
void nsMBCSGroupProber::CheckCandidates()
|
|
{
|
|
for (int i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
if (! mIsActive[i])
|
|
{
|
|
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
candidates[i][j] = false;
|
|
}
|
|
else
|
|
{
|
|
float cf = mProbers[i]->GetConfidence(0);
|
|
|
|
if (mProbers[i]->DecodeToUnicode())
|
|
{
|
|
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
{
|
|
float langConf;
|
|
|
|
/* Process any remaining language data first. */
|
|
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
|
|
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
|
|
|
/* Now check the confidence in this (charset, lang) couple. */
|
|
langConf = langDetectors[i][j]->GetConfidence();
|
|
candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD);
|
|
}
|
|
codePointBufferIdx[i] = 0;
|
|
}
|
|
else
|
|
{
|
|
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
candidates[i][j] = (cf > CANDIDATE_THRESHOLD);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
float nsMBCSGroupProber::GetConfidence(int candidate)
|
|
{
|
|
int num_candidates = GetCandidates();
|
|
int candidate_it = 0;
|
|
|
|
PRUint32 i;
|
|
|
|
if (num_candidates == 0)
|
|
return 0.0;
|
|
else if (candidate >= num_candidates)
|
|
/* Just show the first candidate. */
|
|
candidate = 0;
|
|
|
|
switch (mState)
|
|
{
|
|
case eNotMe:
|
|
return (float)0.01;
|
|
case eFoundIt:
|
|
default:
|
|
for (i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
|
if (candidates[i][j])
|
|
{
|
|
if (candidate == candidate_it)
|
|
{
|
|
float cf = mProbers[i]->GetConfidence(0);
|
|
float langConf = 1.0;
|
|
|
|
if (langDetectors[i][j])
|
|
langConf = langDetectors[i][j]->GetConfidence();
|
|
|
|
return cf * langConf;
|
|
}
|
|
candidate_it++;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Should not happen. */
|
|
return 0.0;
|
|
}
|
|
|
|
#ifdef DEBUG_chardet
|
|
void nsMBCSGroupProber::DumpStatus()
|
|
{
|
|
PRUint32 i;
|
|
float cf;
|
|
|
|
GetConfidence(0);
|
|
for (i = 0; i < NUM_OF_PROBERS; i++)
|
|
{
|
|
if (!mIsActive[i])
|
|
printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
|
|
else
|
|
{
|
|
cf = mProbers[i]->GetConfidence(0);
|
|
printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifdef DEBUG_jgmyers
|
|
void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
|
|
{
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) {
|
|
states[offset].name = ProberName[i];
|
|
states[offset].isActive = mIsActive[i];
|
|
states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence(0) : 0.0;
|
|
++offset;
|
|
}
|
|
}
|
|
#endif /* DEBUG_jgmyers */
|