From 98bf4d73fdc1400a16209cb55840fd7dd46632ab Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 28 May 2017 14:06:53 +0200 Subject: [PATCH] Bug 101204 - different results with different chunk sizes. ASCII and ISO-8859-1 should not be detected in nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd() instead. Otherwise it creates an unwanted shortcut from the first call to uchardet_handle_data() if the input is broken into several pieces and if the first chunk happens to be ASCII (or ASCII + NBSP). --- src/nsUniversalDetector.cpp | 44 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 9711618..75474e0 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) mDone = PR_TRUE; mDetectedCharset = mEscCharSetProber->GetCharSetName(); } - else if (mNbspFound) - { - mDetectedCharset = "ISO-8859-1"; - } - else - { - /* ASCII with the ESC character (or the sequence "~{") is still - * ASCII until proven otherwise. */ - mDetectedCharset = "ASCII"; - } break; case eHighbyte: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) @@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) break; default: - if (mNbspFound) - { - /* ISO-8859-1 is a good result candidate for ASCII + NBSP. - * (though it could have been any ISO-8859 encoding). */ - mDetectedCharset = "ISO-8859-1"; - } - else - { - /* Pure ASCII */ - mDetectedCharset = "ASCII"; - } break; } return NS_OK; @@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd() return; } + if (! mDetectedCharset) + { + switch (mInputState) + { + case eEscAscii: + case ePureAscii: + if (mNbspFound) + { + /* ISO-8859-1 is a good result candidate for ASCII + NBSP. + * (though it could have been any ISO-8859 encoding). */ + mDetectedCharset = "ISO-8859-1"; + } + else + { + /* ASCII with the ESC character (or the sequence "~{") is still + * ASCII until proven otherwise. */ + mDetectedCharset = "ASCII"; + } + default: + break; + } + } + if (mDetectedCharset) { mDone = PR_TRUE;