mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Bug 101204 - different results with different chunk sizes.
ASCII and ISO-8859-1 should not be detected in nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd() instead. Otherwise, detection takes an unwanted shortcut on the first call to uchardet_handle_data(): if the input is broken into several pieces and the first chunk happens to be ASCII (or ASCII + NBSP), the charset is already set from that chunk alone, so the final result depends on how the input was split into chunks.
This commit is contained in:
parent
50743e16f8
commit
98bf4d73fd
@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||||||
mDone = PR_TRUE;
|
mDone = PR_TRUE;
|
||||||
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
||||||
}
|
}
|
||||||
else if (mNbspFound)
|
|
||||||
{
|
|
||||||
mDetectedCharset = "ISO-8859-1";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* ASCII with the ESC character (or the sequence "~{") is still
|
|
||||||
* ASCII until proven otherwise. */
|
|
||||||
mDetectedCharset = "ASCII";
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case eHighbyte:
|
case eHighbyte:
|
||||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||||
@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
if (mNbspFound)
|
|
||||||
{
|
|
||||||
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
|
||||||
* (though it could have been any ISO-8859 encoding). */
|
|
||||||
mDetectedCharset = "ISO-8859-1";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* Pure ASCII */
|
|
||||||
mDetectedCharset = "ASCII";
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return NS_OK;
|
return NS_OK;
|
||||||
@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd()
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (! mDetectedCharset)
|
||||||
|
{
|
||||||
|
switch (mInputState)
|
||||||
|
{
|
||||||
|
case eEscAscii:
|
||||||
|
case ePureAscii:
|
||||||
|
if (mNbspFound)
|
||||||
|
{
|
||||||
|
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
||||||
|
* (though it could have been any ISO-8859 encoding). */
|
||||||
|
mDetectedCharset = "ISO-8859-1";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* ASCII with the ESC character (or the sequence "~{") is still
|
||||||
|
* ASCII until proven otherwise. */
|
||||||
|
mDetectedCharset = "ASCII";
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (mDetectedCharset)
|
if (mDetectedCharset)
|
||||||
{
|
{
|
||||||
mDone = PR_TRUE;
|
mDone = PR_TRUE;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user