mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2026-02-15 23:00:01 +08:00
src: give a little weight to "probable sequences".
Up to now, we were only considering positive sequences, which are the sequences of 2 characters which happen the most. Yet our data gathers 4 categories of sequences (the last one being called "negative", since they never happened in our data). I will call the category just below positive "probable sequences": they may happen, yet not often. The remaining category could be called "neutral". This seems to fix the detection of a user's subtitle example without breaking any of our current unit tests. I should probably still review this whole logic in more detail later.
This commit is contained in:
parent
4287d3accc
commit
e0eec3bae8
@ -124,7 +124,7 @@ float nsSingleByteCharSetProber::GetConfidence(void)
|
|||||||
* character). This could make the difference between very closely related
|
* character). This could make the difference between very closely related
|
||||||
* charsets used for the same language.
|
* charsets used for the same language.
|
||||||
*/
|
*/
|
||||||
r = r*mSeqCounters[POSITIVE_CAT] / mTotalChar;
|
r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar;
|
||||||
/* The more control characters (proportionally to the size of the text), the
|
/* The more control characters (proportionally to the size of the text), the
|
||||||
* less confident we become in the current charset.
|
* less confident we become in the current charset.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -57,8 +57,11 @@
|
|||||||
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
|
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
|
||||||
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
|
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
|
||||||
#define SYMBOL_CAT_ORDER 250
|
#define SYMBOL_CAT_ORDER 250
|
||||||
|
|
||||||
#define NUMBER_OF_SEQ_CAT 4
|
#define NUMBER_OF_SEQ_CAT 4
|
||||||
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
|
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
|
||||||
|
#define PROBABLE_CAT (NUMBER_OF_SEQ_CAT-2)
|
||||||
|
#define NEUTRAL_CAT (NUMBER_OF_SEQ_CAT-3)
|
||||||
#define NEGATIVE_CAT 0
|
#define NEGATIVE_CAT 0
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user