mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
Nearly-ASCII text with NBSP is still not ASCII.
There is no "exception" in encoding. The non-breaking space 0xA0 is not ASCII, and therefore returning "ASCII" will later create issues (for instance, trying to re-encode with iconv produces an error). This was obviously an explicit decision in the original code (according to code comments), probably tied to the specificity of the original program from Mozilla. Now we want strict detection. I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as the only exception" (note that I could have returned any of the ISO-8859 charsets, since they all have this character in common).
This commit is contained in:
parent
886e03a523
commit
4c8316f9cf
@ -47,6 +47,7 @@
|
||||
|
||||
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
{
|
||||
mNbspFound = PR_FALSE;
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
mInTag = PR_FALSE;
|
||||
@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
|
||||
void
|
||||
nsUniversalDetector::Reset()
|
||||
{
|
||||
mNbspFound = PR_FALSE;
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
mInTag = PR_FALSE;
|
||||
@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
PRUint32 i;
|
||||
for (i = 0; i < aLen; i++)
|
||||
{
|
||||
/* Other than 0xA0, if every other character is ASCII, the page is ASCII.
|
||||
/* If every other character is ASCII or 0xA0, we don't run charset
|
||||
* probers.
|
||||
* 0xA0 (NBSP in a few charset) is apparently a rare exception
|
||||
* of non-ASCII character contained in ASCII text. */
|
||||
* of non-ASCII character often contained in nearly-ASCII text. */
|
||||
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
|
||||
{
|
||||
/* We got a non-ASCII byte (high-byte) */
|
||||
@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
}
|
||||
else
|
||||
{
|
||||
//ok, just pure ascii so far
|
||||
if ( ePureAscii == mInputState &&
|
||||
(aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
|
||||
/* Just pure ASCII or NBSP so far. */
|
||||
if (aBuf[i] == '\xA0')
|
||||
{
|
||||
//found escape character or HZ "~{"
|
||||
/* ASCII with the only exception of NBSP seems quite common.
|
||||
* I doubt it is really necessary to train a model here, so let's
|
||||
* just make an exception.
|
||||
*/
|
||||
mNbspFound = PR_TRUE;
|
||||
}
|
||||
else if (mInputState == ePureAscii &&
|
||||
(aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
|
||||
{
|
||||
/* We found an escape character or HZ "~{". */
|
||||
mInputState = eEscAscii;
|
||||
}
|
||||
mLastChar = aBuf[i];
|
||||
@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
||||
}
|
||||
else if (mNbspFound)
|
||||
{
|
||||
mDetectedCharset = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ASCII with the ESC character (or the sequence "~{") is still
|
||||
@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Pure ASCII */
|
||||
mDetectedCharset = "ASCII";
|
||||
if (mNbspFound)
|
||||
{
|
||||
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
||||
* (though it could have been any ISO-8859 encoding). */
|
||||
mDetectedCharset = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Pure ASCII */
|
||||
mDetectedCharset = "ASCII";
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NS_OK;
|
||||
|
||||
@ -72,6 +72,7 @@ protected:
|
||||
virtual void Report(const char* aCharset) = 0;
|
||||
virtual void Reset();
|
||||
nsInputState mInputState;
|
||||
PRBool mNbspFound;
|
||||
PRBool mDone;
|
||||
PRBool mInTag;
|
||||
PRBool mStart;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user