mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
src: do not shortcut UTF-8 detection too early.
I hit a case where the Czech test was detected as Irish because detection was shortcut far too early, after only 16 characters. The confidence value was just barely above 0.5 for Irish (and barely below it for Czech). By adding a threshold (at least 256 characters), we give the engine enough relevant data to actually make an informed decision. By then, the Czech confidence was above 0.7, whereas the Irish one was at 0.6.
This commit is contained in:
parent
2a16ab2310
commit
b00c85a6a6
@ -45,6 +45,8 @@ void nsUTF8Prober::Reset(void)
|
||||
currentCodePoint = 0;
|
||||
}
|
||||
|
||||
#define ENOUGH_CHAR_THRESHOLD 256
|
||||
|
||||
nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
@ -88,7 +90,7 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
}
|
||||
|
||||
if (mState == eDetecting)
|
||||
if (GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
if (mNumOfMBChar > ENOUGH_CHAR_THRESHOLD && GetConfidence(0) > SHORTCUT_THRESHOLD)
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user