mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
src: now reporting encoding+confidence and keeping a list.
Preparing for an updated API which will also allow callers to look at the confidence value, as well as get the list of possible candidates (i.e. all detected encodings whose confidence value was high enough for us to even consider them). For now this is still internal logic only.
This commit is contained in:
parent
a49f8ef6ea
commit
4b7b0476fb
@ -301,9 +301,12 @@ void nsUniversalDetector::DataEnd()
|
||||
|
||||
if (mDetectedCharset)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
Report(mDetectedCharset);
|
||||
return;
|
||||
/* These cases are limited enough that we are always confident
|
||||
* when finding them.
|
||||
*/
|
||||
mDone = PR_TRUE;
|
||||
Report(mDetectedCharset, 1.0);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (mInputState)
|
||||
@ -311,24 +314,18 @@ void nsUniversalDetector::DataEnd()
|
||||
case eHighbyte:
|
||||
{
|
||||
float proberConfidence;
|
||||
float maxProberConfidence = (float)0.0;
|
||||
PRInt32 maxProber = 0;
|
||||
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
{
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
maxProber = i;
|
||||
}
|
||||
|
||||
if (proberConfidence > MINIMUM_THRESHOLD)
|
||||
/* Only report what we are confident in. */
|
||||
Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence);
|
||||
}
|
||||
}
|
||||
//do not report anything because we are not confident of it, that's in fact a negative answer
|
||||
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
||||
Report(mCharSetProbers[maxProber]->GetCharSetName());
|
||||
}
|
||||
break;
|
||||
case eEscAscii:
|
||||
|
||||
@ -69,7 +69,8 @@ public:
|
||||
virtual void DataEnd(void);
|
||||
|
||||
protected:
|
||||
virtual void Report(const char* aCharset) = 0;
|
||||
virtual void Report(const char* aCharset,
|
||||
float confidence) = 0;
|
||||
virtual void Reset();
|
||||
nsInputState mInputState;
|
||||
PRBool mNbspFound;
|
||||
|
||||
@ -37,45 +37,83 @@
|
||||
#include "uchardet.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <vector>
|
||||
#include "nscore.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
|
||||
/* One detection result tracked by the detector.
 *
 * Plain aggregate with no constructor or destructor: whoever stores a
 * candidate is responsible for releasing the strings it points to.
 */
typedef struct _UChardetCandidate
{
    /* Name of the candidate encoding, as a heap-allocated C string. */
    char  *encoding;
    /* Language associated with the candidate; stays NULL after
     * value-initialization (NOTE(review): no visible code fills it in yet). */
    char  *language;
    /* Confidence reported for this encoding; higher means more likely. */
    float  confidence;
} UChardetCandidate;
|
||||
|
||||
class HandleUniversalDetector : public nsUniversalDetector
|
||||
{
|
||||
protected:
|
||||
char *m_charset;
|
||||
std::vector<UChardetCandidate> candidates;
|
||||
|
||||
public:
|
||||
HandleUniversalDetector()
|
||||
: nsUniversalDetector(NS_FILTER_ALL)
|
||||
, m_charset(0)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~HandleUniversalDetector()
|
||||
{
|
||||
if (m_charset)
|
||||
free(m_charset);
|
||||
Reset();
|
||||
}
|
||||
|
||||
virtual void Report(const char* charset)
|
||||
virtual void Report(const char *encoding,
|
||||
float confidence)
|
||||
{
|
||||
if (m_charset)
|
||||
free(m_charset);
|
||||
m_charset = strdup(charset);
|
||||
std::vector<UChardetCandidate>::iterator it;
|
||||
UChardetCandidate candidate;
|
||||
|
||||
for (it = candidates.begin(); it != candidates.end(); it++)
|
||||
{
|
||||
if (strcmp(it->encoding, encoding) == 0)
|
||||
{
|
||||
/* Already reported. Bail out or update the confidence
|
||||
* when needed.
|
||||
*/
|
||||
if (confidence > it->confidence)
|
||||
{
|
||||
candidates.erase(it);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
candidate = UChardetCandidate();
|
||||
candidate.encoding = strdup(encoding);
|
||||
candidate.confidence = confidence;
|
||||
|
||||
for (it = candidates.begin(); it != candidates.end(); it++)
|
||||
{
|
||||
if (it->confidence < confidence)
|
||||
break;
|
||||
}
|
||||
candidates.insert(it, candidate);
|
||||
}
|
||||
|
||||
virtual void Reset()
|
||||
{
|
||||
std::vector<UChardetCandidate>::iterator it;
|
||||
|
||||
nsUniversalDetector::Reset();
|
||||
if (m_charset)
|
||||
free(m_charset);
|
||||
m_charset = strdup("");
|
||||
for (it = candidates.begin(); it != candidates.end(); it++)
|
||||
free(it->encoding);
|
||||
candidates.clear();
|
||||
}
|
||||
|
||||
const char* GetCharset() const
|
||||
{
|
||||
return m_charset? m_charset : "";
|
||||
return (candidates.size() > 0) ? candidates[0].encoding : "";
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user