uchardet_get_charset() must return iconv-compatible names.

It was not clear whether our naming followed any kind of rule. Since iconv is a widely used encoding conversion API, we will follow its naming. At least one returned name was found to be invalid: x-euc-tw instead of EUC-TW. Other names have been uppercased to match the naming shown by `iconv --list`; iconv is mostly case-insensitive, so this should not have been a problem, but the change is made just in case. Prober names can still use free naming (they are apparently only used for output display). Finally, HZ-GB-2312 is absent from my iconv list, but I can still see this encoding in the libiconv master code under this name, so I will consider it valid.
2026-01-01 03:12:24 +08:00 · 2015-11-17 15:52:20 +01:00 · 2015-11-17 15:52:20 +01:00 · dc371f3ba9
commit dc371f3ba9
parent 256d1957b2
10 changed files with 18 additions and 18 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 [uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla.

-uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text.
+uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. Returned encoding names are iconv-compatible.

 The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/

@@ -19,7 +19,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
    * BIG5
    * EUC-TW
    * GB18030
-    * HZ-GB-23121
+    * HZ-GB-2312
  * Japanese
    * ISO-2022-JP
    * SHIFT_JIS
--- a/src/CharDistribution.h
+++ b/src/CharDistribution.h
@@ -128,7 +128,7 @@ public:
  EUCTWDistributionAnalysis();
 protected:

-  //for euc-TW encoding, we are interested 
+  //for EUC-TW encoding, we are interested
  //  first  byte range: 0xc4 -- 0xfe
  //  second byte range: 0xa1 -- 0xfe
  //no validation needed here. State machine has done that
--- a/src/nsBig5Prober.h
+++ b/src/nsBig5Prober.h
@@ -50,7 +50,7 @@ public:
    Reset();}
  virtual ~nsBig5Prober(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return "Big5";}
+  const char* GetCharSetName() {return "BIG5";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/nsEUCTWProber.h
+++ b/src/nsEUCTWProber.h
@@ -50,7 +50,7 @@ public:
    Reset();}
  virtual ~nsEUCTWProber(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return "x-euc-tw";}
+  const char* GetCharSetName() {return "EUC-TW";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/nsGB2312Prober.h
+++ b/src/nsGB2312Prober.h
@@ -42,7 +42,7 @@
 #include "nsCodingStateMachine.h"
 #include "CharDistribution.h"

-// We use gb18030 to replace gb2312, because 18030 is a superset. 
+// We use GB18030 to replace GB2312, because 18030 is a superset.

 class nsGB18030Prober: public nsCharSetProber {
 public:
@@ -52,7 +52,7 @@ public:
    Reset();}
  virtual ~nsGB18030Prober(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return "gb18030";}
+  const char* GetCharSetName() {return "GB18030";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/nsLatin1Prober.h
+++ b/src/nsLatin1Prober.h
@@ -48,7 +48,7 @@ public:
  nsLatin1Prober(void){Reset();}
  virtual ~nsLatin1Prober(void){}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return "windows-1252";}
+  const char* GetCharSetName() {return "WINDOWS-1252";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -44,13 +44,13 @@
 #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
 const char *ProberName[] = 
 {
-  "UTF8",
+  "UTF-8",
  "SJIS",
-  "EUCJP",
+  "EUC-JP",
  "GB18030",
-  "EUCKR",
+  "EUC-KR",
  "Big5",
-  "EUCTW",
+  "EUC-TW",
 };

 #endif
--- a/src/nsMBCSSM.cpp
+++ b/src/nsMBCSSM.cpp
@@ -94,7 +94,7 @@ SMModel const Big5SMModel = {
    5,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
  Big5CharLenTable,
-  "Big5",
+  "BIG5",
 };

 static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
@@ -257,10 +257,10 @@ const SMModel EUCTWSMModel = {
   7,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
  EUCTWCharLenTable,
-  "x-euc-tw",
+  "EUC-TW",
 };

-/* obsolete GB2312 by gb18030
+/* obsolete GB2312 by GB18030
 static PRUint32 GB2312_cls [ 256 / 8 ] = {
 //PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07 
@@ -430,7 +430,7 @@ const SMModel SJISSMModel = {
   6,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
  SJISCharLenTable,
-  "Shift_JIS",
+  "SHIFT_JIS",
 };


--- a/src/nsSJISProber.h
+++ b/src/nsSJISProber.h
@@ -57,7 +57,7 @@ public:
    Reset();}
  virtual ~nsSJISProber(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return "Shift_JIS";}
+  const char* GetCharSetName() {return "SHIFT_JIS";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/uchardet.h
+++ b/src/uchardet.h
@@ -79,7 +79,7 @@ void uchardet_data_end(uchardet_t ud);
 void uchardet_reset(uchardet_t ud);

 /**
- * Get the name of encoding that was detected.
+ * Get an iconv-compatible name of the encoding that was detected.
 * @param ud [in] handle of a instance of uchardet
 * @return name of charset on success and "" on failure or pure ascii.
 */