diff --git a/src/nsLanguageDetector.cpp b/src/nsLanguageDetector.cpp index 0b5baa1..8097028 100644 --- a/src/nsLanguageDetector.cpp +++ b/src/nsLanguageDetector.cpp @@ -212,7 +212,7 @@ float nsLanguageDetector::GetConfidence(void) //float neutralSeqs = mSeqCounters[LANG_NEUTRAL_CAT]; float negativeSeqs = mSeqCounters[LANG_NEGATIVE_CAT]; - r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 2) / mTotalSeqs; + r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio; /* The more characters outside the expected characters * (proportionnaly to the size of the text), the less confident we * become in the current language. diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index 2dfe830..a51cea1 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -48,11 +48,8 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 { order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; - if (order < SYMBOL_CAT_ORDER) - { - mTotalChar++; - } - else if (order == ILL) + mTotalChar++; + if (order == ILL) { /* When encountering an illegal codepoint, no need * to continue analyzing data. */ @@ -63,7 +60,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 { mCtrlChar++; } - if (order < mModel->freqCharCount) + else if (order < mModel->freqCharCount) { mFreqChar++; @@ -75,6 +72,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 else // reverse the order of the letters in the lookup ++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]); } + else if (mLastOrder < SYMBOL_CAT_ORDER) + { + mSeqCounters[NEGATIVE_CAT]++; + mTotalSeqs++; + } + } + else if (order < SYMBOL_CAT_ORDER) + { + mOutChar++; + + if (mLastOrder < SYMBOL_CAT_ORDER) + { + mTotalSeqs++; + mSeqCounters[NEGATIVE_CAT]++; + } } mLastOrder = order; } @@ -92,7 +104,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 return mState; } -void nsSingleByteCharSetProber::Reset(void) +void nsSingleByteCharSetProber::Reset(void) { mState = eDetecting; mLastOrder = 255; @@ -101,7 +113,8 @@ void nsSingleByteCharSetProber::Reset(void) mTotalSeqs = 0; mTotalChar = 0; mCtrlChar = 0; - mFreqChar = 0; + mFreqChar = 0; + mOutChar = 0; } //#define NEGATIVE_APPROACH 1 @@ -117,23 +130,14 @@ float nsSingleByteCharSetProber::GetConfidence(int candidate) float r; if (mTotalSeqs > 0) { - r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; - /* Multiply by a ratio of positive sequences per characters. - * This would help in particular to distinguish close winners. - * Indeed if you add a letter, you'd expect the positive sequence count - * to increase as well. If it doesn't, it may mean that this new codepoint - * may not have been a letter, but instead a symbol (or some other - * character). This could make the difference between very closely related - * charsets used for the same language. - */ - r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar; - /* The more control characters (proportionnaly to the size of the text), the - * less confident we become in the current charset. - */ - r = r * ((float) mTotalChar - mCtrlChar) / mTotalChar; - r = r*mFreqChar/mTotalChar; - if (r >= (float)1.00) - r = (float)0.99; + float positiveSeqs = mSeqCounters[POSITIVE_CAT]; + float probableSeqs = mSeqCounters[PROBABLE_CAT]; + float negativeSeqs = mSeqCounters[NEGATIVE_CAT]; + + r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio; + r = r * (mTotalChar - mOutChar - mCtrlChar) / mTotalChar; + r = r * mFreqChar / mTotalChar; + return r; } return (float)0.01; diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index f52bf2c..f6173a8 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -124,6 +124,7 @@ protected: PRUint32 mCtrlChar; //characters that fall in our sampling range PRUint32 mFreqChar; + PRUint32 mOutChar; // Optional auxiliary prober for name decision. created and destroyed by the GroupProber nsCharSetProber* mNameProber;