mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Bug 101204 - different results with different chunk sizes.
ASCII and ISO-8859-1 should not be detected in nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd() instead. Otherwise, when the input is fed in several chunks and the first chunk happens to be pure ASCII (or ASCII + NBSP), the first call to uchardet_handle_data() takes an unwanted shortcut and sets the detected charset before the remaining chunks have been examined.
This commit is contained in:
parent
50743e16f8
commit
98bf4d73fd
@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
||||
}
|
||||
else if (mNbspFound)
|
||||
{
|
||||
mDetectedCharset = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ASCII with the ESC character (or the sequence "~{") is still
|
||||
* ASCII until proven otherwise. */
|
||||
mDetectedCharset = "ASCII";
|
||||
}
|
||||
break;
|
||||
case eHighbyte:
|
||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
break;
|
||||
|
||||
default:
|
||||
if (mNbspFound)
|
||||
{
|
||||
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
||||
* (though it could have been any ISO-8859 encoding). */
|
||||
mDetectedCharset = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Pure ASCII */
|
||||
mDetectedCharset = "ASCII";
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NS_OK;
|
||||
@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd()
|
||||
return;
|
||||
}
|
||||
|
||||
if (! mDetectedCharset)
|
||||
{
|
||||
switch (mInputState)
|
||||
{
|
||||
case eEscAscii:
|
||||
case ePureAscii:
|
||||
if (mNbspFound)
|
||||
{
|
||||
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
||||
* (though it could have been any ISO-8859 encoding). */
|
||||
mDetectedCharset = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ASCII with the ESC character (or the sequence "~{") is still
|
||||
* ASCII until proven otherwise. */
|
||||
mDetectedCharset = "ASCII";
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (mDetectedCharset)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user