Bug 101204 - different results with different chunk sizes.

ASCII and ISO-8859-1 should not be detected in nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd() instead. Otherwise, when the input is fed in several chunks and the first chunk happens to be pure ASCII (or ASCII + NBSP), the first call to uchardet_handle_data() takes an unwanted shortcut and sets the detected charset prematurely, so the result depends on how the input was split.
2026-02-11 04:40:01 +08:00 · 2017-05-28 14:06:53 +02:00 · 2017-05-28 14:06:53 +02:00 · 98bf4d73fd
commit 98bf4d73fd
parent 50743e16f8
1 changed file with 23 additions and 21 deletions
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
      mDone = PR_TRUE;
      mDetectedCharset = mEscCharSetProber->GetCharSetName();
    }
    else if (mNbspFound)
    {
      mDetectedCharset = "ISO-8859-1";
    }
    else
    {
      /* ASCII with the ESC character (or the sequence "~{") is still
       * ASCII until proven otherwise. */
      mDetectedCharset = "ASCII";
    }
    break;
  case eHighbyte:
    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
    break;
  default:
    if (mNbspFound)
    {
      /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
       * (though it could have been any ISO-8859 encoding). */
      mDetectedCharset = "ISO-8859-1";
    }
    else
    {
      /* Pure ASCII */
      mDetectedCharset = "ASCII";
    }
    break;
  }
  return NS_OK;
@@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd()
    return;
  }
  if (! mDetectedCharset)
  {
    switch (mInputState)
    {
    case eEscAscii:
    case ePureAscii:
      if (mNbspFound)
      {
          /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
           * (though it could have been any ISO-8859 encoding). */
          mDetectedCharset = "ISO-8859-1";
      }
      else
      {
          /* ASCII with the ESC character (or the sequence "~{") is still
           * ASCII until proven otherwise. */
          mDetectedCharset = "ASCII";
      }
    default:
      break;
    }
  }
  if (mDetectedCharset)
  {
    mDone = PR_TRUE;