uchardet/src/nsMBCSGroupProber.cpp
Jehan ab1d2f1120 src: handle long sequences of characters.
Actually my previous commit was not handling all cases, though it was
taking care of the buffer overflow triggered by the provided byte
sequence. Yet I believe it was still possible to craft special input
sequences too long for codePointBuffer.
This additional commit would handle these other cases by processing the
input in manageable sub-strings.
2023-07-17 20:09:10 +02:00

532 lines
16 KiB
C++

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Proofpoint, Inc.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <stdio.h>
#include "nsCJKDetector.h"
#include "nsMBCSGroupProber.h"
#include "nsUniversalDetector.h"
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
const char *ProberName[] =
{
"UTF-8",
"SJIS",
"EUC-JP",
"GB18030",
"EUC-KR",
"Big5",
"EUC-TW",
"Johab"
};
#endif
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
mProbers[i] = nsnull;
codePointBuffer[i] = nsnull;
codePointBufferSize[i] = 0;
codePointBufferIdx[i] = 0;
}
mProbers[0] = new nsUTF8Prober();
if (aLanguageFilter & NS_FILTER_JAPANESE)
{
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
}
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
if (aLanguageFilter & NS_FILTER_KOREAN)
{
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
mProbers[7] = new nsJohabProber(aLanguageFilter == NS_FILTER_KOREAN);
}
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
{
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (mProbers[i]->DecodeToUnicode())
{
int j = 0;
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel);
langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CatalanModel);
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
langDetectors[i][j++] = new nsLanguageDetector(&EnglishModel);
langDetectors[i][j++] = new nsLanguageDetector(&EsperantoModel);
langDetectors[i][j++] = new nsLanguageDetector(&EstonianModel);
langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel);
langDetectors[i][j++] = new nsLanguageDetector(&FrenchModel);
langDetectors[i][j++] = new nsLanguageDetector(&GermanModel);
langDetectors[i][j++] = new nsLanguageDetector(&GeorgianModel);
langDetectors[i][j++] = new nsLanguageDetector(&GreekModel);
langDetectors[i][j++] = new nsLanguageDetector(&HebrewModel);
langDetectors[i][j++] = new nsLanguageDetector(&HindiModel);
langDetectors[i][j++] = new nsLanguageDetector(&HungarianModel);
langDetectors[i][j++] = new nsLanguageDetector(&IrishModel);
langDetectors[i][j++] = new nsLanguageDetector(&ItalianModel);
langDetectors[i][j++] = new nsLanguageDetector(&LatvianModel);
langDetectors[i][j++] = new nsLanguageDetector(&LithuanianModel);
langDetectors[i][j++] = new nsLanguageDetector(&MacedonianModel);
langDetectors[i][j++] = new nsLanguageDetector(&MalteseModel);
langDetectors[i][j++] = new nsLanguageDetector(&NorwegianModel);
langDetectors[i][j++] = new nsLanguageDetector(&PolishModel);
langDetectors[i][j++] = new nsLanguageDetector(&PortugueseModel);
langDetectors[i][j++] = new nsLanguageDetector(&RomanianModel);
langDetectors[i][j++] = new nsLanguageDetector(&RussianModel);
langDetectors[i][j++] = new nsLanguageDetector(&SerbianModel);
langDetectors[i][j++] = new nsLanguageDetector(&SlovakModel);
langDetectors[i][j++] = new nsLanguageDetector(&SloveneModel);
langDetectors[i][j++] = new nsLanguageDetector(&SpanishModel);
langDetectors[i][j++] = new nsLanguageDetector(&SwedishModel);
langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel);
langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel);
langDetectors[i][j++] = new nsLanguageDetector(&UkrainianModel);
langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel);
langDetectors[i][j++] = new nsCJKDetector();
}
else
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j] = nsnull;
}
}
Reset();
}
nsMBCSGroupProber::~nsMBCSGroupProber()
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
delete mProbers[i];
if (codePointBufferSize[i] != 0)
delete [] codePointBuffer[i];
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (langDetectors[i][j])
delete langDetectors[i][j];
}
}
#define CANDIDATE_THRESHOLD 0.3f
int nsMBCSGroupProber::GetCandidates()
{
int num_candidates = 0;
CheckCandidates();
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
num_candidates++;
return num_candidates;
}
const char* nsMBCSGroupProber::GetCharSetName(int candidate)
{
int num_candidates = GetCandidates();
int candidate_it = 0;
if (num_candidates == 0)
return NULL;
else if (candidate >= num_candidates)
/* Just show the first candidate. */
candidate = 0;
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
{
if (candidate == candidate_it)
{
/* We assume that probers included in the nsMBCSGroupProber
* return only one candidate themselves.
* */
return mProbers[i]->GetCharSetName(0);
}
candidate_it++;
}
/* Should not happen. */
return NULL;
}
const char* nsMBCSGroupProber::GetLanguage(int candidate)
{
const char* lang = NULL;
int num_candidates = GetCandidates();
int candidate_it = 0;
if (num_candidates == 0)
return NULL;
else if (candidate >= num_candidates)
/* Just show the first candidate. */
candidate = 0;
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
{
if (candidate == candidate_it)
{
/* We assume that probers included in the nsMBCSGroupProber
* return only one candidate themselves.
* */
lang = mProbers[i]->GetLanguage(0);
if (! lang)
{
/* The prober does not come with its own language. */
if (langDetectors[i][j])
lang = langDetectors[i][j]->GetLanguage();
}
return lang;
}
candidate_it++;
}
return lang;
}
void nsMBCSGroupProber::Reset(void)
{
mActiveNum = 0;
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (mProbers[i])
{
mProbers[i]->Reset();
mIsActive[i] = PR_TRUE;
++mActiveNum;
if (codePointBufferSize[i] == 0 && mProbers[i]->DecodeToUnicode())
{
codePointBufferSize[i] = 1024;
codePointBuffer[i] = new int[codePointBufferSize[i]];
}
codePointBufferIdx[i] = 0;
}
else
mIsActive[i] = PR_FALSE;
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
if (langDetectors[i][j])
langDetectors[i][j]->Reset();
candidates[i][j] = false;
}
}
mState = eDetecting;
mKeepNext = 0;
}
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
int** cpBuffer,
int* cpBufferIdx)
{
nsProbingState st;
PRUint32 start = 0;
PRUint32 keepNext = mKeepNext;
//do filtering to reduce load to probers
for (PRUint32 pos = 0; pos < aLen; ++pos)
{
if (aBuf[pos] & 0x80)
{
if (!keepNext)
start = pos;
keepNext = 2;
}
else if (keepNext)
{
if (--keepNext == 0)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
int sequenceLength;
if (!mIsActive[i])
continue;
sequenceLength = pos + 1 - start;
if (codePointBuffer[i] && codePointBufferIdx[i] + sequenceLength > codePointBufferSize[i])
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
codePointBufferIdx[i] = 0;
}
if (codePointBuffer[i])
{
while (sequenceLength > 0)
{
int subLength = (sequenceLength > codePointBufferSize[i]) ? codePointBufferSize[i] : sequenceLength;
st = mProbers[i]->HandleData(aBuf + start, subLength,
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
if (codePointBufferIdx[i] > 0)
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
codePointBufferIdx[i] = 0;
}
sequenceLength -= subLength;
}
}
else
{
st = mProbers[i]->HandleData(aBuf + start, sequenceLength, NULL, NULL);
}
if (st == eFoundIt)
{
float cf = mProbers[i]->GetConfidence(0);
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
float langConf = langDetectors[i][j] ? langDetectors[i][j]->GetConfidence() : 1.0;
if (cf * langConf > CANDIDATE_THRESHOLD)
{
/* There is at least one (charset, lang) couple for
* which the confidence is high enough.
*/
mState = eFoundIt;
return mState;
}
}
}
}
}
}
else
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (codePointBuffer[i])
{
if (codePointBufferIdx[i] == codePointBufferSize[i] - 1)
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
codePointBufferIdx[i] = 0;
}
codePointBuffer[i][(codePointBufferIdx[i])++] = aBuf[pos];
}
}
}
}
if (keepNext) {
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (!mIsActive[i])
continue;
if (codePointBuffer[i])
st = mProbers[i]->HandleData(aBuf + start, aLen - start,
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
else
st = mProbers[i]->HandleData(aBuf + start, aLen - start, NULL, NULL);
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
codePointBufferIdx[i] = 0;
}
if (st == eFoundIt)
{
float cf = mProbers[i]->GetConfidence(0);
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
float langConf = langDetectors[i][j] ? langDetectors[i][j]->GetConfidence() : 1.0;
if (cf * langConf > CANDIDATE_THRESHOLD)
{
/* There is at least one (charset, lang) couple for
* which the confidence is high enough.
*/
mState = eFoundIt;
return mState;
}
}
}
}
}
mKeepNext = keepNext;
return mState;
}
void nsMBCSGroupProber::CheckCandidates()
{
for (int i = 0; i < NUM_OF_PROBERS; i++)
{
if (! mIsActive[i])
{
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
candidates[i][j] = false;
}
else
{
float cf = mProbers[i]->GetConfidence(0);
if (mProbers[i]->DecodeToUnicode())
{
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
{
float langConf;
/* Process any remaining language data first. */
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
/* Now check the confidence in this (charset, lang) couple. */
langConf = langDetectors[i][j]->GetConfidence();
candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD);
}
codePointBufferIdx[i] = 0;
}
else
{
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
candidates[i][j] = (cf > CANDIDATE_THRESHOLD);
}
}
}
}
float nsMBCSGroupProber::GetConfidence(int candidate)
{
int num_candidates = GetCandidates();
int candidate_it = 0;
PRUint32 i;
if (num_candidates == 0)
return 0.0;
else if (candidate >= num_candidates)
/* Just show the first candidate. */
candidate = 0;
switch (mState)
{
case eNotMe:
return (float)0.01;
case eFoundIt:
default:
for (i = 0; i < NUM_OF_PROBERS; i++)
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
{
if (candidate == candidate_it)
{
float cf = mProbers[i]->GetConfidence(0);
float langConf = 1.0;
if (langDetectors[i][j])
langConf = langDetectors[i][j]->GetConfidence();
return cf * langConf;
}
candidate_it++;
}
}
}
/* Should not happen. */
return 0.0;
}
#ifdef DEBUG_chardet
void nsMBCSGroupProber::DumpStatus()
{
PRUint32 i;
float cf;
GetConfidence(0);
for (i = 0; i < NUM_OF_PROBERS; i++)
{
if (!mIsActive[i])
printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
else
{
cf = mProbers[i]->GetConfidence(0);
printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
}
}
}
#endif
#ifdef DEBUG_jgmyers
void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) {
states[offset].name = ProberName[i];
states[offset].isActive = mIsActive[i];
states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence(0) : 0.0;
++offset;
}
}
#endif /* DEBUG_jgmyers */