mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Using the generic language detector in UTF-8 detection.
The UTF-8 prober now not only detects valid UTF-8, but also detects
the most probable language. Using the data generated two commits
earlier, this works very well.
This is still basic and will require even more improvements. In
particular, nsUTF8Prober should now return an array of candidate
("UTF-8", language) pairs, and nsMBCSGroupProber should itself forward
these candidates as well as other candidates from other multi-byte
detectors. This way, the public-facing API would get more probable
candidates, in case the algorithm is slightly wrong.
Also, the UTF-8 confidence is currently stupidly high as soon as we
consider it to be right. We should likely weigh it with language
detection (in particular, if no language is detected, this should
severely weigh down UTF-8 detection: not to 0, but high enough to be a
fallback in case no other encoding+lang is valid, and low enough to give
a chance to other good candidate pairs).
This commit is contained in:
parent
dac7cbd30f
commit
5257fc1abf
@ -44,7 +44,9 @@ void nsBig5Prober::Reset(void)
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
|
||||
@ -49,7 +49,9 @@ public:
|
||||
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();}
|
||||
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "BIG5";}
|
||||
const char* GetLanguage() {return "zh";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -55,7 +55,10 @@ public:
|
||||
virtual ~nsCharSetProber() {}
|
||||
virtual const char* GetCharSetName() = 0;
|
||||
virtual const char* GetLanguage() = 0;
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx) = 0;
|
||||
virtual bool DecodeToUnicode() {return false;}
|
||||
virtual nsProbingState GetState(void) = 0;
|
||||
virtual void Reset(void) = 0;
|
||||
virtual float GetConfidence(void) = 0;
|
||||
|
||||
@ -50,7 +50,9 @@ void nsEUCJPProber::Reset(void)
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
|
||||
@ -55,7 +55,9 @@ public:
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCJPProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "EUC-JP";}
|
||||
const char* GetLanguage() {return "ja";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -45,7 +45,9 @@ void nsEUCKRProber::Reset(void)
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
|
||||
@ -50,7 +50,9 @@ public:
|
||||
Reset();
|
||||
}
|
||||
virtual ~nsEUCKRProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
/* "Unified Hangul Code", also called "CP949" or "Windows-949" is a
|
||||
* superset of EUC-KR. Though not fully ok to return UHC here (a
|
||||
* separate prober would be better), it is acceptable, since many
|
||||
|
||||
@ -45,7 +45,9 @@ void nsEUCTWProber::Reset(void)
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
|
||||
@ -49,7 +49,9 @@ public:
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "EUC-TW";}
|
||||
const char* GetLanguage() {return "zh";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -73,7 +73,9 @@ void nsEscCharSetProber::Reset(void)
|
||||
mDetectedCharset = nsnull;
|
||||
}
|
||||
|
||||
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
PRInt32 j;
|
||||
|
||||
@ -49,7 +49,9 @@ class nsEscCharSetProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEscCharSetProber(PRUint32 aLanguageFilter);
|
||||
virtual ~nsEscCharSetProber(void);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return mDetectedCharset;}
|
||||
const char* GetLanguage() {return NULL;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -50,7 +50,9 @@ void nsGB18030Prober::Reset(void)
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
|
||||
@ -51,7 +51,9 @@ public:
|
||||
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();}
|
||||
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "GB18030";}
|
||||
const char* GetLanguage() {return "zh";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -106,7 +106,9 @@ PRBool nsHebrewProber::isNonFinal(char c)
|
||||
* The input buffer should not contain any white spaces that are not (' ')
|
||||
* or any low-ascii punctuation marks.
|
||||
*/
|
||||
nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
// Both model probers say it's not them. No reason to continue.
|
||||
if (GetState() == eNotMe)
|
||||
|
||||
@ -48,7 +48,9 @@ public:
|
||||
nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); }
|
||||
|
||||
virtual ~nsHebrewProber(void) {}
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
virtual const char *GetCharSetName();
|
||||
virtual const char *GetLanguage(void) { return "he"; }
|
||||
virtual void Reset(void);
|
||||
|
||||
@ -114,7 +114,9 @@ void nsLatin1Prober::Reset(void)
|
||||
}
|
||||
|
||||
|
||||
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
char *newBuf1 = 0;
|
||||
PRUint32 newLen1 = 0;
|
||||
|
||||
@ -49,7 +49,9 @@ class nsLatin1Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsLatin1Prober(void){Reset();}
|
||||
virtual ~nsLatin1Prober(void){}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "WINDOWS-1252";}
|
||||
const char* GetLanguage() {return NULL;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -58,7 +58,12 @@ const char *ProberName[] =
|
||||
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
||||
{
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
mProbers[i] = nsnull;
|
||||
{
|
||||
mProbers[i] = nsnull;
|
||||
codePointBuffer[i] = nsnull;
|
||||
codePointBufferSize[i] = 0;
|
||||
codePointBufferIdx[i] = 0;
|
||||
}
|
||||
|
||||
mProbers[0] = new nsUTF8Prober();
|
||||
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||
@ -75,6 +80,24 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
||||
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
}
|
||||
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (mProbers[i]->DecodeToUnicode())
|
||||
{
|
||||
langDetectors[i][0] = new nsLanguageDetector(&FrenchModel);
|
||||
langDetectors[i][1] = new nsLanguageDetector(&ItalianModel);
|
||||
langDetectors[i][2] = new nsLanguageDetector(&DanishModel);
|
||||
langDetectors[i][3] = new nsLanguageDetector(&GermanModel);
|
||||
langDetectors[i][4] = new nsLanguageDetector(&ArabicModel);
|
||||
langDetectors[i][5] = new nsLanguageDetector(&SpanishModel);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
langDetectors[i][j] = nsnull;
|
||||
}
|
||||
}
|
||||
Reset();
|
||||
}
|
||||
|
||||
@ -83,6 +106,13 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
delete mProbers[i];
|
||||
|
||||
if (codePointBufferSize[i] != 0)
|
||||
delete [] codePointBuffer[i];
|
||||
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
if (langDetectors[i][j])
|
||||
delete langDetectors[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
@ -99,17 +129,35 @@ const char* nsMBCSGroupProber::GetCharSetName()
|
||||
|
||||
const char* nsMBCSGroupProber::GetLanguage(void)
|
||||
{
|
||||
const char* maxLang = NULL;
|
||||
int maxLangIdx = -1;
|
||||
float maxConfidence = 0.0;
|
||||
|
||||
if (mBestGuess == -1)
|
||||
{
|
||||
GetConfidence();
|
||||
}
|
||||
if (mBestGuess == -1)
|
||||
return NULL;
|
||||
return NULL;
|
||||
else
|
||||
return mProbers[mBestGuess]->GetLanguage();
|
||||
maxLang = mProbers[mBestGuess]->GetLanguage();
|
||||
|
||||
if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
|
||||
{
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
float conf = langDetectors[mBestGuess][j]->GetConfidence();
|
||||
|
||||
if (conf > maxConfidence)
|
||||
{
|
||||
maxLangIdx = j;
|
||||
maxConfidence = conf;
|
||||
}
|
||||
}
|
||||
if (maxLangIdx != -1)
|
||||
maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
|
||||
}
|
||||
|
||||
return maxLang;
|
||||
}
|
||||
|
||||
void nsMBCSGroupProber::Reset(void)
|
||||
void nsMBCSGroupProber::Reset(void)
|
||||
{
|
||||
mActiveNum = 0;
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
@ -119,6 +167,13 @@ void nsMBCSGroupProber::Reset(void)
|
||||
mProbers[i]->Reset();
|
||||
mIsActive[i] = PR_TRUE;
|
||||
++mActiveNum;
|
||||
|
||||
if (codePointBufferSize[i] == 0 && mProbers[i]->DecodeToUnicode())
|
||||
{
|
||||
codePointBufferSize[i] = 1024;
|
||||
codePointBuffer[i] = new int[codePointBufferSize[i]];
|
||||
}
|
||||
codePointBufferIdx[i] = 0;
|
||||
}
|
||||
else
|
||||
mIsActive[i] = PR_FALSE;
|
||||
@ -128,7 +183,9 @@ void nsMBCSGroupProber::Reset(void)
|
||||
mKeepNext = 0;
|
||||
}
|
||||
|
||||
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** cpBuffer,
|
||||
int* cpBufferIdx)
|
||||
{
|
||||
nsProbingState st;
|
||||
PRUint32 start = 0;
|
||||
@ -151,7 +208,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
|
||||
|
||||
if (codePointBuffer[i])
|
||||
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start,
|
||||
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
|
||||
else
|
||||
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start, NULL, NULL);
|
||||
|
||||
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
|
||||
{
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
||||
codePointBufferIdx[i] = 0;
|
||||
}
|
||||
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
@ -161,6 +231,12 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
if (codePointBuffer[i])
|
||||
codePointBuffer[i][(codePointBufferIdx[i])++] = aBuf[pos];
|
||||
}
|
||||
}
|
||||
|
||||
if (keepNext) {
|
||||
@ -168,7 +244,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
|
||||
|
||||
if (codePointBuffer[i])
|
||||
st = mProbers[i]->HandleData(aBuf + start, aLen - start,
|
||||
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
|
||||
else
|
||||
st = mProbers[i]->HandleData(aBuf + start, aLen - start, NULL, NULL);
|
||||
|
||||
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
|
||||
{
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
|
||||
codePointBufferIdx[i] = 0;
|
||||
}
|
||||
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
|
||||
@ -48,12 +48,15 @@
|
||||
#include "nsEUCTWProber.h"
|
||||
|
||||
#define NUM_OF_PROBERS 7
|
||||
#define NUM_OF_LANGUAGES 6
|
||||
|
||||
class nsMBCSGroupProber: public nsCharSetProber {
|
||||
public:
|
||||
nsMBCSGroupProber(PRUint32 aLanguageFilter);
|
||||
virtual ~nsMBCSGroupProber();
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName();
|
||||
const char* GetLanguage();
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
@ -75,6 +78,12 @@ protected:
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mActiveNum;
|
||||
PRUint32 mKeepNext;
|
||||
|
||||
int *codePointBuffer[NUM_OF_PROBERS];
|
||||
int codePointBufferSize[NUM_OF_PROBERS];
|
||||
int codePointBufferIdx[NUM_OF_PROBERS];
|
||||
|
||||
nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
|
||||
};
|
||||
|
||||
#endif /* nsMBCSGroupProber_h__ */
|
||||
|
||||
@ -253,7 +253,9 @@ void nsSBCSGroupProber::Reset(void)
|
||||
}
|
||||
|
||||
|
||||
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
nsProbingState st;
|
||||
PRUint32 i;
|
||||
@ -276,7 +278,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(newBuf1, newLen1);
|
||||
st = mProbers[i]->HandleData(newBuf1, newLen1, codePointBuffer, codePointBufferIdx);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
|
||||
@ -47,7 +47,9 @@ class nsSBCSGroupProber: public nsCharSetProber {
|
||||
public:
|
||||
nsSBCSGroupProber();
|
||||
virtual ~nsSBCSGroupProber();
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName();
|
||||
const char* GetLanguage();
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -38,7 +38,9 @@
|
||||
#include <stdio.h>
|
||||
#include "nsSBCharSetProber.h"
|
||||
|
||||
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
unsigned char order;
|
||||
|
||||
|
||||
@ -88,7 +88,9 @@ public:
|
||||
|
||||
virtual const char* GetCharSetName();
|
||||
virtual const char* GetLanguage();
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
virtual nsProbingState GetState(void) {return mState;}
|
||||
virtual void Reset(void);
|
||||
virtual float GetConfidence(void);
|
||||
|
||||
@ -50,7 +50,9 @@ void nsSJISProber::Reset(void)
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
|
||||
@ -56,7 +56,9 @@ public:
|
||||
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();}
|
||||
virtual ~nsSJISProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "SHIFT_JIS";}
|
||||
const char* GetLanguage() {return "ja";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
|
||||
@ -42,9 +42,12 @@ void nsUTF8Prober::Reset(void)
|
||||
mCodingSM->Reset();
|
||||
mNumOfMBChar = 0;
|
||||
mState = eDetecting;
|
||||
currentCodePoint = 0;
|
||||
}
|
||||
|
||||
nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
{
|
||||
PRUint32 codingState;
|
||||
|
||||
@ -59,7 +62,28 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (codingState == eStart)
|
||||
{
|
||||
if (mCodingSM->GetCurrentCharLen() >= 2)
|
||||
{
|
||||
mNumOfMBChar++;
|
||||
|
||||
currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6);
|
||||
if (mCodingSM->GetCurrentCharLen() == 2)
|
||||
currentCodePoint &= 0x7ff;
|
||||
else if (mCodingSM->GetCurrentCharLen() == 3)
|
||||
currentCodePoint &= 0xffff;
|
||||
else
|
||||
currentCodePoint &= 0x1fffff;
|
||||
}
|
||||
else
|
||||
{
|
||||
currentCodePoint = 0xff & (char) aBuf[i];
|
||||
}
|
||||
|
||||
(*codePointBuffer)[(*codePointBufferIdx)++] = currentCodePoint;
|
||||
currentCodePoint = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6);
|
||||
}
|
||||
}
|
||||
|
||||
@ -84,4 +108,3 @@ float nsUTF8Prober::GetConfidence(void)
|
||||
else
|
||||
return (float)0.99;
|
||||
}
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
#include <cstddef>
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "nsLanguageDetector.h"
|
||||
|
||||
class nsUTF8Prober: public nsCharSetProber {
|
||||
public:
|
||||
@ -48,7 +49,9 @@ public:
|
||||
mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
|
||||
Reset(); }
|
||||
virtual ~nsUTF8Prober(){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
const char* GetCharSetName() {return "UTF-8";}
|
||||
const char* GetLanguage() {return NULL;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
@ -56,11 +59,14 @@ public:
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {}
|
||||
|
||||
virtual bool DecodeToUnicode() {return true;}
|
||||
|
||||
protected:
|
||||
nsCodingStateMachine* mCodingSM;
|
||||
nsProbingState mState;
|
||||
PRUint32 mNumOfMBChar;
|
||||
|
||||
int currentCodePoint;
|
||||
};
|
||||
|
||||
#endif /* nsUTF8Prober_h__ */
|
||||
|
||||
|
||||
@ -55,6 +55,8 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
|
||||
mStart = PR_TRUE;
|
||||
mDetectedCharset = nsnull;
|
||||
mDetectedLanguage = nsnull;
|
||||
mDetectedConfidence = 0.0;
|
||||
mGotData = PR_FALSE;
|
||||
mInputState = ePureAscii;
|
||||
mLastChar = '\0';
|
||||
@ -83,6 +85,8 @@ nsUniversalDetector::Reset()
|
||||
|
||||
mStart = PR_TRUE;
|
||||
mDetectedCharset = nsnull;
|
||||
mDetectedLanguage = nsnull;
|
||||
mDetectedConfidence = 0.0;
|
||||
mGotData = PR_FALSE;
|
||||
mInputState = ePureAscii;
|
||||
mLastChar = '\0';
|
||||
@ -118,13 +122,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
case '\xEF':
|
||||
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
||||
{
|
||||
/* EF BB BF: UTF-8 encoded BOM. */
|
||||
mDetectedCharset = "UTF-8";
|
||||
mDetectedConfidence = 0.99;
|
||||
}
|
||||
break;
|
||||
case '\xFE':
|
||||
if ('\xFF' == aBuf[1])
|
||||
{
|
||||
/* FE FF: UTF-16, big endian BOM. */
|
||||
mDetectedCharset = "UTF-16";
|
||||
mDetectedConfidence = 0.99;
|
||||
}
|
||||
break;
|
||||
case '\xFF':
|
||||
if ('\xFE' == aBuf[1])
|
||||
@ -135,11 +145,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
/* FF FE 00 00: UTF-32 (LE). */
|
||||
mDetectedCharset = "UTF-32";
|
||||
mDetectedConfidence = 0.99;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* FF FE: UTF-16, little endian BOM. */
|
||||
mDetectedCharset = "UTF-16";
|
||||
mDetectedConfidence = 0.99;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -151,6 +163,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
/* 00 00 FE FF: UTF-32 (BE). */
|
||||
mDetectedCharset = "UTF-32";
|
||||
mDetectedConfidence = 0.99;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -236,11 +249,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (nsnull == mEscCharSetProber)
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
st = mEscCharSetProber->HandleData(aBuf, aLen);
|
||||
st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
||||
mDetectedConfidence = mEscCharSetProber->GetConfidence();
|
||||
}
|
||||
break;
|
||||
case eHighbyte:
|
||||
@ -248,11 +262,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen, NULL, NULL);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
mDetectedLanguage = mCharSetProbers[i]->GetLanguage();
|
||||
mDetectedConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
return NS_OK;
|
||||
}
|
||||
}
|
||||
@ -305,7 +321,7 @@ void nsUniversalDetector::DataEnd()
|
||||
* when finding them.
|
||||
*/
|
||||
mDone = PR_TRUE;
|
||||
Report(mDetectedCharset, NULL, 1.0);
|
||||
Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -81,6 +81,8 @@ protected:
|
||||
PRBool mGotData;
|
||||
char mLastChar;
|
||||
const char * mDetectedCharset;
|
||||
const char * mDetectedLanguage;
|
||||
float mDetectedConfidence;
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mLanguageFilter;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user