Bug 101204 - different results with different chunk sizes.

ASCII and ISO-8859-1 should not be detected in
nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd()
instead. Otherwise, the first call to uchardet_handle_data() takes an
unwanted shortcut when the input is broken into several pieces and the
first chunk happens to be ASCII (or ASCII + NBSP), so the detected
charset ends up depending on how the input was chunked.
This commit is contained in:
Jehan 2017-05-28 14:06:53 +02:00
parent 50743e16f8
commit 98bf4d73fd

View File

@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
mDone = PR_TRUE; mDone = PR_TRUE;
mDetectedCharset = mEscCharSetProber->GetCharSetName(); mDetectedCharset = mEscCharSetProber->GetCharSetName();
} }
else if (mNbspFound)
{
mDetectedCharset = "ISO-8859-1";
}
else
{
/* ASCII with the ESC character (or the sequence "~{") is still
* ASCII until proven otherwise. */
mDetectedCharset = "ASCII";
}
break; break;
case eHighbyte: case eHighbyte:
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
break; break;
default: default:
if (mNbspFound)
{
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
* (though it could have been any ISO-8859 encoding). */
mDetectedCharset = "ISO-8859-1";
}
else
{
/* Pure ASCII */
mDetectedCharset = "ASCII";
}
break; break;
} }
return NS_OK; return NS_OK;
@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd()
return; return;
} }
if (! mDetectedCharset)
{
switch (mInputState)
{
case eEscAscii:
case ePureAscii:
if (mNbspFound)
{
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
* (though it could have been any ISO-8859 encoding). */
mDetectedCharset = "ISO-8859-1";
}
else
{
/* ASCII with the ESC character (or the sequence "~{") is still
* ASCII until proven otherwise. */
mDetectedCharset = "ASCII";
}
default:
break;
}
}
if (mDetectedCharset) if (mDetectedCharset)
{ {
mDone = PR_TRUE; mDone = PR_TRUE;