From e0eec3bae8409d528458c8b69ffd18ff6a52bc94 Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 25 May 2016 17:24:37 +0200 Subject: [PATCH] src: give a little weight to "probable sequences". Up to now, we were only considering positive sequences, i.e. the sequences of 2 characters which occur the most often. Yet our data gathers 4 categories of sequences (the last one being called "negative", since they never happened in our data). I will call the category just below positive "probable sequences". They may happen, yet not often. The remaining category could be called "neutral". This seems to fix the detection of a user's subtitle example without breaking any of our current unit tests. I should probably still review this whole logic in more detail later. --- src/nsSBCharSetProber.cpp | 2 +- src/nsSBCharSetProber.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index cb00fbc..1f7f473 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -124,7 +124,7 @@ float nsSingleByteCharSetProber::GetConfidence(void) * character). This could make the difference between very closely related * charsets used for the same language. */ - r = r*mSeqCounters[POSITIVE_CAT] / mTotalChar; + r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar; /* The more control characters (proportionnaly to the size of the text), the * less confident we become in the current charset. */ diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index fb40d3f..211846e 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -57,8 +57,11 @@ #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 #define SYMBOL_CAT_ORDER 250 + #define NUMBER_OF_SEQ_CAT 4 #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) +#define PROBABLE_CAT (NUMBER_OF_SEQ_CAT-2) +#define NEUTRAL_CAT (NUMBER_OF_SEQ_CAT-3) #define NEGATIVE_CAT 0 typedef struct