mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
src: consider any combination with a non-frequent character as a sequence.
Basically, since we exclude non-letters (control chars, punctuation, spaces, separators, emoticons and whatnot), we consider any remaining character to be an off-script letter (we may have forgotten some cases, but so far it looks promising). Hence it is normal to consider a combination with these (i.e. 2 off-script letters, or 1 frequent letter + 1 off-script letter, in any order) as a sequence too. Doing so will drop even further the confidence of any text having too many of these. As a consequence, it again widens the gap between the first and second contenders, which seems to show that it really works.
This commit is contained in:
parent
a1b186fa8b
commit
8113f604de
@ -143,6 +143,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
|
||||
* but they will drop a bit the confidence.
|
||||
*/
|
||||
mOutChar++;
|
||||
order = -2;
|
||||
|
||||
if (mLastOrder == -2 || mLastOrder >= 0)
|
||||
/* Adding a non frequent sequence. */
|
||||
mTotalSeqs++;
|
||||
}
|
||||
}
|
||||
else if (order < mModel->freqCharCount)
|
||||
@ -154,6 +159,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
|
||||
mTotalSeqs++;
|
||||
++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]);
|
||||
}
|
||||
else if (mLastOrder == -2)
|
||||
{
|
||||
/* Adding a non frequent sequence. */
|
||||
mTotalSeqs++;
|
||||
}
|
||||
}
|
||||
mLastOrder = order;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user