src: now reporting encoding+confidence and keeping a list.

Preparing for an updated API which will also allow callers to look at the
confidence value, as well as get the list of possible candidates (i.e.
all detected encodings which had a confidence value high enough that
we would even consider them).
It is still only internal logic, though.
This commit is contained in:
Jehan 2020-04-23 16:15:54 +02:00
parent a49f8ef6ea
commit 4b7b0476fb
3 changed files with 62 additions and 26 deletions

View File

@ -301,9 +301,12 @@ void nsUniversalDetector::DataEnd()
if (mDetectedCharset)
{
mDone = PR_TRUE;
Report(mDetectedCharset);
return;
/* These cases are limited enough that we are always confident
* when finding them.
*/
mDone = PR_TRUE;
Report(mDetectedCharset, 1.0);
return;
}
switch (mInputState)
@ -311,24 +314,18 @@ void nsUniversalDetector::DataEnd()
case eHighbyte:
{
float proberConfidence;
float maxProberConfidence = (float)0.0;
PRInt32 maxProber = 0;
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
if (mCharSetProbers[i])
{
proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence)
{
maxProberConfidence = proberConfidence;
maxProber = i;
}
if (proberConfidence > MINIMUM_THRESHOLD)
/* Only report what we are confident in. */
Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence);
}
}
//do not report anything because we are not confident of it, that's in fact a negative answer
if (maxProberConfidence > MINIMUM_THRESHOLD)
Report(mCharSetProbers[maxProber]->GetCharSetName());
}
break;
case eEscAscii:

View File

@ -69,7 +69,8 @@ public:
virtual void DataEnd(void);
protected:
virtual void Report(const char* aCharset) = 0;
virtual void Report(const char* aCharset,
float confidence) = 0;
virtual void Reset();
nsInputState mInputState;
PRBool mNbspFound;

View File

@ -37,45 +37,83 @@
#include "uchardet.h"
#include <string.h>
#include <stdlib.h>
#include <vector>
#include "nscore.h"
#include "nsUniversalDetector.h"
/* One detection result kept by HandleUniversalDetector: an encoding name
 * together with the confidence it was reported with.
 */
typedef struct _UChardetCandidate
{
/* Encoding name. HandleUniversalDetector::Report() stores a strdup()
 * copy here and HandleUniversalDetector::Reset() releases it with free().
 */
char *encoding;
/* NOTE(review): never assigned in the visible code; presumably reserved
 * for a detected-language name in a later API update -- confirm.
 */
char *language;
/* Confidence passed to Report() for this encoding; the candidate list is
 * ordered on this value, highest first.
 */
float confidence;
} UChardetCandidate;
class HandleUniversalDetector : public nsUniversalDetector
{
protected:
char *m_charset;
std::vector<UChardetCandidate> candidates;
public:
HandleUniversalDetector()
: nsUniversalDetector(NS_FILTER_ALL)
, m_charset(0)
{
}
virtual ~HandleUniversalDetector()
{
if (m_charset)
free(m_charset);
Reset();
}
virtual void Report(const char* charset)
virtual void Report(const char *encoding,
float confidence)
{
if (m_charset)
free(m_charset);
m_charset = strdup(charset);
std::vector<UChardetCandidate>::iterator it;
UChardetCandidate candidate;
for (it = candidates.begin(); it != candidates.end(); it++)
{
if (strcmp(it->encoding, encoding) == 0)
{
/* Already reported. Bail out or update the confidence
* when needed.
*/
if (confidence > it->confidence)
{
candidates.erase(it);
break;
}
else
{
return;
}
}
}
candidate = UChardetCandidate();
candidate.encoding = strdup(encoding);
candidate.confidence = confidence;
for (it = candidates.begin(); it != candidates.end(); it++)
{
if (it->confidence < confidence)
break;
}
candidates.insert(it, candidate);
}
virtual void Reset()
{
std::vector<UChardetCandidate>::iterator it;
nsUniversalDetector::Reset();
if (m_charset)
free(m_charset);
m_charset = strdup("");
for (it = candidates.begin(); it != candidates.end(); it++)
free(it->encoding);
candidates.clear();
}
const char* GetCharset() const
{
return m_charset? m_charset : "";
return (candidates.size() > 0) ? candidates[0].encoding : "";
}
};