mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
uchardet_get_charset() must return iconv-compatible names.
It was not clear whether our naming followed any kind of rules. iconv is a widely used encoding conversion API, so we will follow its naming. At least one returned name was found to be invalid: x-euc-tw instead of EUC-TW. Other names have been uppercased to match the naming in `iconv --list`; iconv is mostly case-insensitive, so this should not have been a problem, but it is done just in case. Prober names can still be chosen freely (they are apparently only used for output display). Finally, HZ-GB-2312 is absent from my iconv list, but I can still see this encoding under this name in the libiconv master code, so I will consider it valid.
This commit is contained in:
parent
256d1957b2
commit
dc371f3ba9
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
[uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla.
|
[uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla.
|
||||||
|
|
||||||
uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text.
|
uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. Returned encoding names are iconv-compatible.
|
||||||
|
|
||||||
The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/
|
The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/
|
||||||
|
|
||||||
@ -19,7 +19,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
|
|||||||
* BIG5
|
* BIG5
|
||||||
* EUC-TW
|
* EUC-TW
|
||||||
* GB18030
|
* GB18030
|
||||||
* HZ-GB-23121
|
* HZ-GB-2312
|
||||||
* Japanese
|
* Japanese
|
||||||
* ISO-2022-JP
|
* ISO-2022-JP
|
||||||
* SHIFT_JIS
|
* SHIFT_JIS
|
||||||
|
|||||||
@ -128,7 +128,7 @@ public:
|
|||||||
EUCTWDistributionAnalysis();
|
EUCTWDistributionAnalysis();
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
//for euc-TW encoding, we are interested
|
//for EUC-TW encoding, we are interested
|
||||||
// first byte range: 0xc4 -- 0xfe
|
// first byte range: 0xc4 -- 0xfe
|
||||||
// second byte range: 0xa1 -- 0xfe
|
// second byte range: 0xa1 -- 0xfe
|
||||||
//no validation needed here. State machine has done that
|
//no validation needed here. State machine has done that
|
||||||
|
|||||||
@ -50,7 +50,7 @@ public:
|
|||||||
Reset();}
|
Reset();}
|
||||||
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "Big5";}
|
const char* GetCharSetName() {return "BIG5";}
|
||||||
nsProbingState GetState(void) {return mState;}
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
|
|||||||
@ -50,7 +50,7 @@ public:
|
|||||||
Reset();}
|
Reset();}
|
||||||
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "x-euc-tw";}
|
const char* GetCharSetName() {return "EUC-TW";}
|
||||||
nsProbingState GetState(void) {return mState;}
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
|
|||||||
@ -42,7 +42,7 @@
|
|||||||
#include "nsCodingStateMachine.h"
|
#include "nsCodingStateMachine.h"
|
||||||
#include "CharDistribution.h"
|
#include "CharDistribution.h"
|
||||||
|
|
||||||
// We use gb18030 to replace gb2312, because 18030 is a superset.
|
// We use GB18030 to replace GB2312, because 18030 is a superset.
|
||||||
|
|
||||||
class nsGB18030Prober: public nsCharSetProber {
|
class nsGB18030Prober: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
@ -52,7 +52,7 @@ public:
|
|||||||
Reset();}
|
Reset();}
|
||||||
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "gb18030";}
|
const char* GetCharSetName() {return "GB18030";}
|
||||||
nsProbingState GetState(void) {return mState;}
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
|
|||||||
@ -48,7 +48,7 @@ public:
|
|||||||
nsLatin1Prober(void){Reset();}
|
nsLatin1Prober(void){Reset();}
|
||||||
virtual ~nsLatin1Prober(void){}
|
virtual ~nsLatin1Prober(void){}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "windows-1252";}
|
const char* GetCharSetName() {return "WINDOWS-1252";}
|
||||||
nsProbingState GetState(void) {return mState;}
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
|
|||||||
@ -44,13 +44,13 @@
|
|||||||
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
||||||
const char *ProberName[] =
|
const char *ProberName[] =
|
||||||
{
|
{
|
||||||
"UTF8",
|
"UTF-8",
|
||||||
"SJIS",
|
"SJIS",
|
||||||
"EUCJP",
|
"EUC-JP",
|
||||||
"GB18030",
|
"GB18030",
|
||||||
"EUCKR",
|
"EUC-KR",
|
||||||
"Big5",
|
"Big5",
|
||||||
"EUCTW",
|
"EUC-TW",
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -94,7 +94,7 @@ SMModel const Big5SMModel = {
|
|||||||
5,
|
5,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
|
||||||
Big5CharLenTable,
|
Big5CharLenTable,
|
||||||
"Big5",
|
"BIG5",
|
||||||
};
|
};
|
||||||
|
|
||||||
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
||||||
@ -257,10 +257,10 @@ const SMModel EUCTWSMModel = {
|
|||||||
7,
|
7,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
|
||||||
EUCTWCharLenTable,
|
EUCTWCharLenTable,
|
||||||
"x-euc-tw",
|
"EUC-TW",
|
||||||
};
|
};
|
||||||
|
|
||||||
/* obsolete GB2312 by gb18030
|
/* obsolete GB2312 by GB18030
|
||||||
static PRUint32 GB2312_cls [ 256 / 8 ] = {
|
static PRUint32 GB2312_cls [ 256 / 8 ] = {
|
||||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||||
@ -430,7 +430,7 @@ const SMModel SJISSMModel = {
|
|||||||
6,
|
6,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
|
||||||
SJISCharLenTable,
|
SJISCharLenTable,
|
||||||
"Shift_JIS",
|
"SHIFT_JIS",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -57,7 +57,7 @@ public:
|
|||||||
Reset();}
|
Reset();}
|
||||||
virtual ~nsSJISProber(void){delete mCodingSM;}
|
virtual ~nsSJISProber(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "Shift_JIS";}
|
const char* GetCharSetName() {return "SHIFT_JIS";}
|
||||||
nsProbingState GetState(void) {return mState;}
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
|
|||||||
@ -79,7 +79,7 @@ void uchardet_data_end(uchardet_t ud);
|
|||||||
void uchardet_reset(uchardet_t ud);
|
void uchardet_reset(uchardet_t ud);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the name of encoding that was detected.
|
* Get an iconv-compatible name of the encoding that was detected.
|
||||||
* @param ud [in] handle of an instance of uchardet
|
* @param ud [in] handle of an instance of uchardet
|
||||||
* @return name of charset on success and "" on failure or pure ASCII.
|
* @return name of charset on success and "" on failure or pure ASCII.
|
||||||
*/
|
*/
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user