mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
src: allow for nsCharSetProber to return several candidates.
No functional change yet, because all probers still return exactly 1 candidate. However, this commit adds a GetCandidates() method which returns the number of candidates a prober can report. GetCharSetName(), GetLanguage() and GetConfidence() now take a parameter which is the candidate index (it must be less than the return value of GetCandidates()). We can now consider that nsCharSetProber computes a (charset, language) pair and that the confidence applies to this specific pair, not just to charset detection.
This commit is contained in:
parent
ea32980273
commit
2127f4fc0d
@ -75,13 +75,13 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
mLastChar[0] = aBuf[aLen-1];
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsBig5Prober::GetConfidence(void)
|
||||
float nsBig5Prober::GetConfidence(int candidate)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
|
||||
@ -52,11 +52,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "BIG5";}
|
||||
const char* GetLanguage() {return "zh";}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "BIG5";}
|
||||
const char* GetLanguage(int) {return "zh";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int);
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -53,15 +53,16 @@ typedef enum {
|
||||
class nsCharSetProber {
|
||||
public:
|
||||
virtual ~nsCharSetProber() {}
|
||||
virtual const char* GetCharSetName() = 0;
|
||||
virtual const char* GetLanguage() = 0;
|
||||
virtual int GetCandidates() = 0;
|
||||
virtual const char* GetCharSetName(int candidate) = 0;
|
||||
virtual const char* GetLanguage(int candidate) = 0;
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx) = 0;
|
||||
virtual bool DecodeToUnicode() {return false;}
|
||||
virtual nsProbingState GetState(void) = 0;
|
||||
virtual void Reset(void) = 0;
|
||||
virtual float GetConfidence(void) = 0;
|
||||
virtual float GetConfidence(int candidate) = 0;
|
||||
virtual void SetOpion() = 0;
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
|
||||
@ -85,13 +85,13 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
mLastChar[0] = aBuf[aLen-1];
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (mContextAnalyser.GotEnoughData() && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsEUCJPProber::GetConfidence(void)
|
||||
float nsEUCJPProber::GetConfidence(int candidate)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
@ -58,11 +58,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "EUC-JP";}
|
||||
const char* GetLanguage() {return "ja";}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "EUC-JP";}
|
||||
const char* GetLanguage(int) {return "ja";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int);
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -76,7 +76,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
mLastChar[0] = aBuf[aLen-1];
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
// else
|
||||
// mDistributionAnalyser.HandleData(aBuf, aLen);
|
||||
@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsEUCKRProber::GetConfidence(void)
|
||||
float nsEUCKRProber::GetConfidence(int candidate)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
|
||||
@ -53,16 +53,17 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
virtual int GetCandidates() { return 1; }
|
||||
/* "Unified Hangul Code", also called "CP949" or "Windows-949" is a
|
||||
* superset of EUC-KR. Though not fully ok to return UHC here (a
|
||||
* separate prober would be better), it is acceptable, since many
|
||||
* Korean documents are actually created with this character set.
|
||||
*/
|
||||
const char* GetCharSetName() {return "UHC";}
|
||||
const char* GetLanguage() {return "ko";}
|
||||
const char* GetCharSetName(int) {return "UHC";}
|
||||
const char* GetLanguage(int) {return "ko";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int);
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -76,7 +76,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
mLastChar[0] = aBuf[aLen-1];
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
// else
|
||||
// mDistributionAnalyser.HandleData(aBuf, aLen);
|
||||
@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsEUCTWProber::GetConfidence(void)
|
||||
float nsEUCTWProber::GetConfidence(int candidate)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
|
||||
@ -52,11 +52,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "EUC-TW";}
|
||||
const char* GetLanguage() {return "zh";}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "EUC-TW";}
|
||||
const char* GetLanguage(int) {return "zh";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int);
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -52,11 +52,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return mDetectedCharset;}
|
||||
const char* GetLanguage() {return NULL;}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return mDetectedCharset;}
|
||||
const char* GetLanguage(int) {return NULL;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void){return (float)0.99;}
|
||||
float GetConfidence(int){return (float)0.99;}
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -81,7 +81,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
mLastChar[0] = aBuf[aLen-1];
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (mDistributionAnalyser.GotEnoughData() && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
// else
|
||||
// mDistributionAnalyser.HandleData(aBuf, aLen);
|
||||
@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsGB18030Prober::GetConfidence(void)
|
||||
float nsGB18030Prober::GetConfidence(int candidate)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
|
||||
@ -54,11 +54,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "GB18030";}
|
||||
const char* GetLanguage() {return "zh";}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "GB18030";}
|
||||
const char* GetLanguage(int) {return "zh";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int candidate);
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -144,7 +144,7 @@ nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
}
|
||||
|
||||
// Make the decision: is it Logical or Visual?
|
||||
const char* nsHebrewProber::GetCharSetName()
|
||||
const char* nsHebrewProber::GetCharSetName(int candidate)
|
||||
{
|
||||
// If the final letter score distance is dominant enough, rely on it.
|
||||
PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
|
||||
@ -154,7 +154,7 @@ const char* nsHebrewProber::GetCharSetName()
|
||||
return VISUAL_HEBREW_NAME;
|
||||
|
||||
// It's not dominant enough, try to rely on the model scores instead.
|
||||
float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
|
||||
float modelsub = mLogicalProb->GetConfidence(0) - mVisualProb->GetConfidence(0);
|
||||
if (modelsub > MIN_MODEL_DISTANCE)
|
||||
return LOGICAL_HEBREW_NAME;
|
||||
if (modelsub < -(MIN_MODEL_DISTANCE))
|
||||
|
||||
@ -51,13 +51,14 @@ public:
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
virtual const char *GetCharSetName();
|
||||
virtual const char *GetLanguage(void) { return "he"; }
|
||||
virtual int GetCandidates() { return 1; }
|
||||
virtual const char *GetCharSetName(int candidate);
|
||||
virtual const char *GetLanguage(int) { return "he"; }
|
||||
virtual void Reset(void);
|
||||
|
||||
virtual nsProbingState GetState(void);
|
||||
|
||||
virtual float GetConfidence(void) { return (float)0.0; }
|
||||
virtual float GetConfidence(int) { return (float)0.0; }
|
||||
virtual void SetOpion() {}
|
||||
|
||||
void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
|
||||
|
||||
@ -146,7 +146,7 @@ nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsLatin1Prober::GetConfidence(void)
|
||||
float nsLatin1Prober::GetConfidence(int candidate)
|
||||
{
|
||||
if (mState == eNotMe)
|
||||
return 0.01f;
|
||||
@ -177,7 +177,7 @@ float nsLatin1Prober::GetConfidence(void)
|
||||
#ifdef DEBUG_chardet
|
||||
void nsLatin1Prober::DumpStatus()
|
||||
{
|
||||
printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
|
||||
printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(0), GetCharSetName());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -52,11 +52,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "WINDOWS-1252";}
|
||||
const char* GetLanguage() {return NULL;}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "WINDOWS-1252";}
|
||||
const char* GetLanguage(int) {return NULL;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int candidate);
|
||||
void SetOpion() {}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
|
||||
@ -138,18 +138,18 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
|
||||
}
|
||||
}
|
||||
|
||||
const char* nsMBCSGroupProber::GetCharSetName()
|
||||
const char* nsMBCSGroupProber::GetCharSetName(int candidate)
|
||||
{
|
||||
if (mBestGuess == -1)
|
||||
{
|
||||
GetConfidence();
|
||||
GetConfidence(0);
|
||||
if (mBestGuess == -1)
|
||||
mBestGuess = 0;
|
||||
}
|
||||
return mProbers[mBestGuess]->GetCharSetName();
|
||||
return mProbers[mBestGuess]->GetCharSetName(0);
|
||||
}
|
||||
|
||||
const char* nsMBCSGroupProber::GetLanguage(void)
|
||||
const char* nsMBCSGroupProber::GetLanguage(int candidate)
|
||||
{
|
||||
const char* maxLang = NULL;
|
||||
int maxLangIdx = -1;
|
||||
@ -158,7 +158,7 @@ const char* nsMBCSGroupProber::GetLanguage(void)
|
||||
if (mBestGuess == -1)
|
||||
return NULL;
|
||||
else
|
||||
maxLang = mProbers[mBestGuess]->GetLanguage();
|
||||
maxLang = mProbers[mBestGuess]->GetLanguage(0);
|
||||
|
||||
if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
|
||||
{
|
||||
@ -299,7 +299,7 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsMBCSGroupProber::GetConfidence(void)
|
||||
float nsMBCSGroupProber::GetConfidence(int candidate)
|
||||
{
|
||||
PRUint32 i;
|
||||
float bestConf = 0.0, cf;
|
||||
@ -316,7 +316,7 @@ float nsMBCSGroupProber::GetConfidence(void)
|
||||
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
cf = mProbers[i]->GetConfidence(0);
|
||||
|
||||
if (mProbers[i]->DecodeToUnicode())
|
||||
{
|
||||
@ -346,14 +346,14 @@ void nsMBCSGroupProber::DumpStatus()
|
||||
PRUint32 i;
|
||||
float cf;
|
||||
|
||||
GetConfidence();
|
||||
GetConfidence(0);
|
||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
|
||||
else
|
||||
{
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
cf = mProbers[i]->GetConfidence(0);
|
||||
printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
|
||||
}
|
||||
}
|
||||
@ -366,7 +366,7 @@ void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&st
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) {
|
||||
states[offset].name = ProberName[i];
|
||||
states[offset].isActive = mIsActive[i];
|
||||
states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0;
|
||||
states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence(0) : 0.0;
|
||||
++offset;
|
||||
}
|
||||
}
|
||||
|
||||
@ -57,11 +57,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName();
|
||||
const char* GetLanguage();
|
||||
int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int candidate);
|
||||
const char* GetLanguage(int candidate);
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int candidate);
|
||||
void SetOpion() {}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
|
||||
@ -209,29 +209,29 @@ nsSBCSGroupProber::~nsSBCSGroupProber()
|
||||
}
|
||||
|
||||
|
||||
const char* nsSBCSGroupProber::GetCharSetName()
|
||||
const char* nsSBCSGroupProber::GetCharSetName(int candidate)
|
||||
{
|
||||
//if we have no answer yet
|
||||
if (mBestGuess == -1)
|
||||
{
|
||||
GetConfidence();
|
||||
GetConfidence(0);
|
||||
//no charset seems positive
|
||||
if (mBestGuess == -1)
|
||||
//we will use default.
|
||||
mBestGuess = 0;
|
||||
}
|
||||
return mProbers[mBestGuess]->GetCharSetName();
|
||||
return mProbers[mBestGuess]->GetCharSetName(0);
|
||||
}
|
||||
|
||||
const char* nsSBCSGroupProber::GetLanguage()
|
||||
const char* nsSBCSGroupProber::GetLanguage(int candidate)
|
||||
{
|
||||
if (mBestGuess == -1)
|
||||
{
|
||||
GetConfidence();
|
||||
GetConfidence(0);
|
||||
if (mBestGuess == -1)
|
||||
mBestGuess = 0;
|
||||
}
|
||||
return mProbers[mBestGuess]->GetLanguage();
|
||||
return mProbers[mBestGuess]->GetLanguage(0);
|
||||
}
|
||||
|
||||
void nsSBCSGroupProber::Reset(void)
|
||||
@ -303,7 +303,7 @@ done:
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsSBCSGroupProber::GetConfidence(void)
|
||||
float nsSBCSGroupProber::GetConfidence(int candidate)
|
||||
{
|
||||
PRUint32 i;
|
||||
float bestConf = 0.0, cf;
|
||||
@ -319,7 +319,7 @@ float nsSBCSGroupProber::GetConfidence(void)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
cf = mProbers[i]->GetConfidence(0);
|
||||
if (bestConf < cf)
|
||||
{
|
||||
bestConf = cf;
|
||||
@ -336,16 +336,16 @@ void nsSBCSGroupProber::DumpStatus()
|
||||
PRUint32 i;
|
||||
float cf;
|
||||
|
||||
cf = GetConfidence();
|
||||
cf = GetConfidence(0);
|
||||
printf(" SBCS Group Prober --------begin status \r\n");
|
||||
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
|
||||
printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName(0));
|
||||
else
|
||||
mProbers[i]->DumpStatus();
|
||||
}
|
||||
printf(" SBCS Group found best match [%s] confidence %f.\r\n",
|
||||
mProbers[mBestGuess]->GetCharSetName(), cf);
|
||||
mProbers[mBestGuess]->GetCharSetName(0), cf);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -50,11 +50,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName();
|
||||
const char* GetLanguage();
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int);
|
||||
const char* GetLanguage(int);
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int);
|
||||
void SetOpion() {}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
|
||||
@ -82,7 +82,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
|
||||
if (mState == eDetecting)
|
||||
if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
|
||||
{
|
||||
float cf = GetConfidence();
|
||||
float cf = GetConfidence(0);
|
||||
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
|
||||
@ -106,7 +106,7 @@ void nsSingleByteCharSetProber::Reset(void)
|
||||
|
||||
//#define NEGATIVE_APPROACH 1
|
||||
|
||||
float nsSingleByteCharSetProber::GetConfidence(void)
|
||||
float nsSingleByteCharSetProber::GetConfidence(int candidate)
|
||||
{
|
||||
#ifdef NEGATIVE_APPROACH
|
||||
if (mTotalSeqs > 0)
|
||||
@ -140,23 +140,23 @@ float nsSingleByteCharSetProber::GetConfidence(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
const char* nsSingleByteCharSetProber::GetCharSetName()
|
||||
const char* nsSingleByteCharSetProber::GetCharSetName(int candidate)
|
||||
{
|
||||
if (!mNameProber)
|
||||
return mModel->charsetName;
|
||||
return mNameProber->GetCharSetName();
|
||||
return mNameProber->GetCharSetName(0);
|
||||
}
|
||||
|
||||
const char* nsSingleByteCharSetProber::GetLanguage()
|
||||
const char* nsSingleByteCharSetProber::GetLanguage(int candidate)
|
||||
{
|
||||
if (!mNameProber)
|
||||
return mModel->langName;
|
||||
return mNameProber->GetLanguage();
|
||||
return mNameProber->GetLanguage(0);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void nsSingleByteCharSetProber::DumpStatus()
|
||||
{
|
||||
printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
|
||||
printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(0), GetCharSetName(0));
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -86,14 +86,15 @@ public:
|
||||
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
||||
|
||||
virtual const char* GetCharSetName();
|
||||
virtual const char* GetLanguage();
|
||||
virtual int GetCandidates() { return 1; }
|
||||
virtual const char* GetCharSetName(int candidate);
|
||||
virtual const char* GetLanguage(int candidate);
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
virtual nsProbingState GetState(void) {return mState;}
|
||||
virtual void Reset(void);
|
||||
virtual float GetConfidence(void);
|
||||
virtual float GetConfidence(int candidate);
|
||||
virtual void SetOpion() {}
|
||||
|
||||
// This feature is not implemented yet. any current language model
|
||||
|
||||
@ -84,13 +84,13 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
mLastChar[0] = aBuf[aLen-1];
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (mContextAnalyser.GotEnoughData() && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
|
||||
return mState;
|
||||
}
|
||||
|
||||
float nsSJISProber::GetConfidence(void)
|
||||
float nsSJISProber::GetConfidence(int candidate)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
@ -59,11 +59,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "SHIFT_JIS";}
|
||||
const char* GetLanguage() {return "ja";}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "SHIFT_JIS";}
|
||||
const char* GetLanguage(int) {return "ja";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int candidate);
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
|
||||
@ -88,14 +88,14 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
}
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
if (GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
|
||||
#define ONE_CHAR_PROB (float)0.50
|
||||
|
||||
float nsUTF8Prober::GetConfidence(void)
|
||||
float nsUTF8Prober::GetConfidence(int candidate)
|
||||
{
|
||||
float unlike = (float)0.99;
|
||||
|
||||
|
||||
@ -52,11 +52,12 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "UTF-8";}
|
||||
const char* GetLanguage() {return NULL;}
|
||||
virtual int GetCandidates() { return 1; }
|
||||
const char* GetCharSetName(int) {return "UTF-8";}
|
||||
const char* GetLanguage(int) {return NULL;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
float GetConfidence(int candidate);
|
||||
void SetOpion() {}
|
||||
|
||||
virtual bool DecodeToUnicode() {return true;}
|
||||
|
||||
@ -253,8 +253,8 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
||||
mDetectedConfidence = mEscCharSetProber->GetConfidence();
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName(0);
|
||||
mDetectedConfidence = mEscCharSetProber->GetConfidence(0);
|
||||
}
|
||||
break;
|
||||
case eHighbyte:
|
||||
@ -266,9 +266,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
mDetectedLanguage = mCharSetProbers[i]->GetLanguage();
|
||||
mDetectedConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName(0);
|
||||
mDetectedLanguage = mCharSetProbers[i]->GetLanguage(0);
|
||||
mDetectedConfidence = mCharSetProbers[i]->GetConfidence(0);
|
||||
return NS_OK;
|
||||
}
|
||||
}
|
||||
@ -335,12 +335,12 @@ void nsUniversalDetector::DataEnd()
|
||||
{
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence(0);
|
||||
|
||||
if (proberConfidence > MINIMUM_THRESHOLD)
|
||||
/* Only report what we are confident in. */
|
||||
Report(mCharSetProbers[i]->GetCharSetName(),
|
||||
mCharSetProbers[i]->GetLanguage(),
|
||||
Report(mCharSetProbers[i]->GetCharSetName(0),
|
||||
mCharSetProbers[i]->GetLanguage(0),
|
||||
proberConfidence);
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user