From dc371f3ba93cb5961887be54acef42eb08a104dd Mon Sep 17 00:00:00 2001 From: Jehan Date: Tue, 17 Nov 2015 15:52:20 +0100 Subject: [PATCH] uchardet_get_charset() must return iconv-compatible names. It was not clear if our naming followed any kind of rules. In particular, iconv is a widely used encoding conversion API. We will follow its naming. At least 1 returned name was found invalid: x-euc-tw instead of EUC-TW. Other names have been uppercased to follow naming from `iconv --list` though iconv is mostly case-insensitive so it should not have been a problem. "Just in case". Prober names can still have free naming (only used for output display apparently). Finally HZ-GB-2312 is absent from my iconv list, but I can still see this encoding in libiconv master code with this name. So I will consider it valid. --- README.md | 4 ++-- src/CharDistribution.h | 2 +- src/nsBig5Prober.h | 2 +- src/nsEUCTWProber.h | 2 +- src/nsGB2312Prober.h | 4 ++-- src/nsLatin1Prober.h | 2 +- src/nsMBCSGroupProber.cpp | 8 ++++---- src/nsMBCSSM.cpp | 8 ++++---- src/nsSJISProber.h | 2 +- src/uchardet.h | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 943b03e..ade756d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. -uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. +uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. Returned encoding names are iconv-compatible. The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/ @@ -19,7 +19,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * BIG5 * EUC-TW * GB18030 - * HZ-GB-23121 + * HZ-GB-2312 * Japanese * ISO-2022-JP * SHIFT_JIS diff --git a/src/CharDistribution.h b/src/CharDistribution.h index 453c2de..368ac3e 100644 --- a/src/CharDistribution.h +++ b/src/CharDistribution.h @@ -128,7 +128,7 @@ public: EUCTWDistributionAnalysis(); protected: - //for euc-TW encoding, we are interested + //for EUC-TW encoding, we are interested // first byte range: 0xc4 -- 0xfe // second byte range: 0xa1 -- 0xfe //no validation needed here. State machine has done that diff --git a/src/nsBig5Prober.h b/src/nsBig5Prober.h index 5ae3576..7d13be8 100644 --- a/src/nsBig5Prober.h +++ b/src/nsBig5Prober.h @@ -50,7 +50,7 @@ public: Reset();} virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "Big5";} + const char* GetCharSetName() {return "BIG5";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEUCTWProber.h b/src/nsEUCTWProber.h index 911d50b..ee6376e 100644 --- a/src/nsEUCTWProber.h +++ b/src/nsEUCTWProber.h @@ -50,7 +50,7 @@ public: Reset();} virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "x-euc-tw";} + const char* GetCharSetName() {return "EUC-TW";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsGB2312Prober.h b/src/nsGB2312Prober.h index 4bdac3b..26ebf84 100644 --- a/src/nsGB2312Prober.h +++ b/src/nsGB2312Prober.h @@ -42,7 +42,7 @@ #include "nsCodingStateMachine.h" #include "CharDistribution.h" -// We use gb18030 to replace gb2312, because 18030 is a superset. +// We use GB18030 to replace GB2312, because 18030 is a superset. class nsGB18030Prober: public nsCharSetProber { public: @@ -52,7 +52,7 @@ public: Reset();} virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "gb18030";} + const char* GetCharSetName() {return "GB18030";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsLatin1Prober.h b/src/nsLatin1Prober.h index 5145e96..59118a7 100644 --- a/src/nsLatin1Prober.h +++ b/src/nsLatin1Prober.h @@ -48,7 +48,7 @@ public: nsLatin1Prober(void){Reset();} virtual ~nsLatin1Prober(void){} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "windows-1252";} + const char* GetCharSetName() {return "WINDOWS-1252";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index f161165..057ddb1 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -44,13 +44,13 @@ #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) const char *ProberName[] = { - "UTF8", + "UTF-8", "SJIS", - "EUCJP", + "EUC-JP", "GB18030", - "EUCKR", + "EUC-KR", "Big5", - "EUCTW", + "EUC-TW", }; #endif diff --git a/src/nsMBCSSM.cpp b/src/nsMBCSSM.cpp index 584e931..5070096 100644 --- a/src/nsMBCSSM.cpp +++ b/src/nsMBCSSM.cpp @@ -94,7 +94,7 @@ SMModel const Big5SMModel = { 5, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, Big5CharLenTable, - "Big5", + "BIG5", }; static const PRUint32 EUCJP_cls [ 256 / 8 ] = { @@ -257,10 +257,10 @@ const SMModel EUCTWSMModel = { 7, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, EUCTWCharLenTable, - "x-euc-tw", + "EUC-TW", }; -/* obsolete GB2312 by gb18030 +/* obsolete GB2312 by GB18030 static PRUint32 GB2312_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 @@ -430,7 +430,7 @@ const SMModel SJISSMModel = { 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, SJISCharLenTable, - "Shift_JIS", + "SHIFT_JIS", }; diff --git a/src/nsSJISProber.h b/src/nsSJISProber.h index 1efb6e3..f326ded 100644 --- a/src/nsSJISProber.h +++ b/src/nsSJISProber.h @@ -57,7 +57,7 @@ public: Reset();} virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "Shift_JIS";} + const char* GetCharSetName() {return "SHIFT_JIS";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/uchardet.h b/src/uchardet.h index 0ce738f..526548a 100644 --- a/src/uchardet.h +++ b/src/uchardet.h @@ -79,7 +79,7 @@ void uchardet_data_end(uchardet_t ud); void uchardet_reset(uchardet_t ud); /** - * Get the name of encoding that was detected. + * Get an iconv-compatible name of the encoding that was detected. * @param ud [in] handle of a instance of uchardet * @return name of charset on success and "" on failure or pure ascii. */