uchardet_get_charset() must return iconv-compatible names.

It was not clear whether our naming followed any kind of rules. Since
iconv is a widely used encoding conversion API, we will follow its
naming.
At least one returned name was found to be invalid: x-euc-tw instead of EUC-TW.
Other names have been uppercased to follow naming from `iconv --list`
though iconv is mostly case-insensitive so it should not have been a
problem. "Just in case".
Prober names can still have free naming (only used for output display
apparently).
Finally, HZ-GB-2312 is absent from my iconv list, but I can still see
this encoding in the libiconv master code under this name, so I will
consider it valid.
This commit is contained in:
Jehan 2015-11-17 15:52:20 +01:00
parent 256d1957b2
commit dc371f3ba9
10 changed files with 18 additions and 18 deletions

View File

@ -2,7 +2,7 @@
[uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla.
uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text.
uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. Returned encoding names are iconv-compatible.
The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/
@ -19,7 +19,7 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
* BIG5
* EUC-TW
* GB18030
* HZ-GB-23121
* HZ-GB-2312
* Japanese
* ISO-2022-JP
* SHIFT_JIS

View File

@ -128,7 +128,7 @@ public:
EUCTWDistributionAnalysis();
protected:
//for euc-TW encoding, we are interested
//for EUC-TW encoding, we are interested
// first byte range: 0xc4 -- 0xfe
// second byte range: 0xa1 -- 0xfe
//no validation needed here. State machine has done that

View File

@ -50,7 +50,7 @@ public:
Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Big5";}
const char* GetCharSetName() {return "BIG5";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -50,7 +50,7 @@ public:
Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "x-euc-tw";}
const char* GetCharSetName() {return "EUC-TW";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -42,7 +42,7 @@
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
// We use gb18030 to replace gb2312, because 18030 is a superset.
// We use GB18030 to replace GB2312, because 18030 is a superset.
class nsGB18030Prober: public nsCharSetProber {
public:
@ -52,7 +52,7 @@ public:
Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "gb18030";}
const char* GetCharSetName() {return "GB18030";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -48,7 +48,7 @@ public:
nsLatin1Prober(void){Reset();}
virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "windows-1252";}
const char* GetCharSetName() {return "WINDOWS-1252";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -44,13 +44,13 @@
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
const char *ProberName[] =
{
"UTF8",
"UTF-8",
"SJIS",
"EUCJP",
"EUC-JP",
"GB18030",
"EUCKR",
"EUC-KR",
"Big5",
"EUCTW",
"EUC-TW",
};
#endif

View File

@ -94,7 +94,7 @@ SMModel const Big5SMModel = {
5,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
Big5CharLenTable,
"Big5",
"BIG5",
};
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
@ -257,10 +257,10 @@ const SMModel EUCTWSMModel = {
7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
EUCTWCharLenTable,
"x-euc-tw",
"EUC-TW",
};
/* obsolete GB2312 by gb18030
/* obsolete GB2312 by GB18030
static PRUint32 GB2312_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
@ -430,7 +430,7 @@ const SMModel SJISSMModel = {
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
SJISCharLenTable,
"Shift_JIS",
"SHIFT_JIS",
};

View File

@ -57,7 +57,7 @@ public:
Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Shift_JIS";}
const char* GetCharSetName() {return "SHIFT_JIS";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -79,7 +79,7 @@ void uchardet_data_end(uchardet_t ud);
void uchardet_reset(uchardet_t ud);
/**
* Get the name of encoding that was detected.
* Get an iconv-compatible name of the encoding that was detected.
* @param ud [in] handle of an instance of uchardet
* @return name of charset on success and "" on failure or pure ascii.
*/