uchardet_get_charset() must return iconv-compatible names.

It was not clear whether our charset naming followed any kind of rule.
Since iconv is a widely used encoding conversion API, we will follow its
naming.
At least 1 returned name was found invalid: x-euc-tw instead of EUC-TW.
Other names have been uppercased to match the naming from `iconv --list`,
even though iconv is mostly case-insensitive, so the previous casing should
not have been a problem. "Just in case".
Prober names may still be chosen freely (they are apparently only used for
output display).
Finally, HZ-GB-2312 is absent from my `iconv --list` output, but I can
still see this encoding under this name in the libiconv master code, so I
will consider it valid.
This commit is contained in:
Jehan 2015-11-17 15:52:20 +01:00
parent 256d1957b2
commit dc371f3ba9
10 changed files with 18 additions and 18 deletions

View File

@ -2,7 +2,7 @@
[uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. [uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla.
uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. Returned encoding names are iconv-compatible.
The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/ The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/
@ -19,7 +19,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* BIG5 * BIG5
* EUC-TW * EUC-TW
* GB18030 * GB18030
* HZ-GB-23121 * HZ-GB-2312
* Japanese * Japanese
* ISO-2022-JP * ISO-2022-JP
* SHIFT_JIS * SHIFT_JIS

View File

@ -128,7 +128,7 @@ public:
EUCTWDistributionAnalysis(); EUCTWDistributionAnalysis();
protected: protected:
//for euc-TW encoding, we are interested //for EUC-TW encoding, we are interested
// first byte range: 0xc4 -- 0xfe // first byte range: 0xc4 -- 0xfe
// second byte range: 0xa1 -- 0xfe // second byte range: 0xa1 -- 0xfe
//no validation needed here. State machine has done that //no validation needed here. State machine has done that

View File

@ -50,7 +50,7 @@ public:
Reset();} Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;} virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Big5";} const char* GetCharSetName() {return "BIG5";}
nsProbingState GetState(void) {return mState;} nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);

View File

@ -50,7 +50,7 @@ public:
Reset();} Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;} virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "x-euc-tw";} const char* GetCharSetName() {return "EUC-TW";}
nsProbingState GetState(void) {return mState;} nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);

View File

@ -42,7 +42,7 @@
#include "nsCodingStateMachine.h" #include "nsCodingStateMachine.h"
#include "CharDistribution.h" #include "CharDistribution.h"
// We use gb18030 to replace gb2312, because 18030 is a superset. // We use GB18030 to replace GB2312, because 18030 is a superset.
class nsGB18030Prober: public nsCharSetProber { class nsGB18030Prober: public nsCharSetProber {
public: public:
@ -52,7 +52,7 @@ public:
Reset();} Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;} virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "gb18030";} const char* GetCharSetName() {return "GB18030";}
nsProbingState GetState(void) {return mState;} nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);

View File

@ -48,7 +48,7 @@ public:
nsLatin1Prober(void){Reset();} nsLatin1Prober(void){Reset();}
virtual ~nsLatin1Prober(void){} virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "windows-1252";} const char* GetCharSetName() {return "WINDOWS-1252";}
nsProbingState GetState(void) {return mState;} nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);

View File

@ -44,13 +44,13 @@
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
const char *ProberName[] = const char *ProberName[] =
{ {
"UTF8", "UTF-8",
"SJIS", "SJIS",
"EUCJP", "EUC-JP",
"GB18030", "GB18030",
"EUCKR", "EUC-KR",
"Big5", "Big5",
"EUCTW", "EUC-TW",
}; };
#endif #endif

View File

@ -94,7 +94,7 @@ SMModel const Big5SMModel = {
5, 5,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
Big5CharLenTable, Big5CharLenTable,
"Big5", "BIG5",
}; };
static const PRUint32 EUCJP_cls [ 256 / 8 ] = { static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
@ -257,10 +257,10 @@ const SMModel EUCTWSMModel = {
7, 7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
EUCTWCharLenTable, EUCTWCharLenTable,
"x-euc-tw", "EUC-TW",
}; };
/* obsolete GB2312 by gb18030 /* obsolete GB2312 by GB18030
static PRUint32 GB2312_cls [ 256 / 8 ] = { static PRUint32 GB2312_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
@ -430,7 +430,7 @@ const SMModel SJISSMModel = {
6, 6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
SJISCharLenTable, SJISCharLenTable,
"Shift_JIS", "SHIFT_JIS",
}; };

View File

@ -57,7 +57,7 @@ public:
Reset();} Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;} virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Shift_JIS";} const char* GetCharSetName() {return "SHIFT_JIS";}
nsProbingState GetState(void) {return mState;} nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);

View File

@ -79,7 +79,7 @@ void uchardet_data_end(uchardet_t ud);
void uchardet_reset(uchardet_t ud); void uchardet_reset(uchardet_t ud);
/** /**
* Get the name of encoding that was detected. * Get an iconv-compatible name of the encoding that was detected.
* @param ud [in] handle of a instance of uchardet * @param ud [in] handle of a instance of uchardet
* @return name of charset on success and "" on failure or pure ascii. * @return name of charset on success and "" on failure or pure ascii.
*/ */