From 19a730b78b8aa3a38a468f927567a4d94d4f7f2c Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 11:52:22 +0300 Subject: [PATCH 01/10] Basic support for Unicode 'slash'uABCD escape notation - parser understands escape sequence and following data --- .../chaiscript/language/chaiscript_parser.hpp | 64 +++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index 1ea73716..4104278f 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -928,6 +928,29 @@ namespace chaiscript return false; } + // Generic for u16, u32 and (probably) wchar + template + static string_type str_from_ll(long long val) + { + return string_type(1, string_type::value_type(val)); //size, character + } + + // Specialization for char + template<> + static std::string str_from_ll(long long val) + { + std::string::value_type c[2]; + c[1] = val; + c[0] = val >> 8; + + if (c[0] == 0) + { + return std::string(1, c[1]); //size, character + } + + return std::string(c, 2); //char buffer, size + } + template struct Char_Parser { @@ -938,6 +961,7 @@ namespace chaiscript bool saw_interpolation_marker; bool is_octal; bool is_hex; + bool is_unicode; const bool interpolation_allowed; string_type octal_matches; @@ -950,6 +974,7 @@ namespace chaiscript saw_interpolation_marker(false), is_octal(false), is_hex(false), + is_unicode(false), interpolation_allowed(t_interpolation_allowed) { } @@ -964,6 +989,10 @@ namespace chaiscript if (is_hex) { process_hex(); } + + if (is_unicode) { + process_unicode(); + } } void process_hex() @@ -985,9 +1014,23 @@ namespace chaiscript is_octal = false; } + + void process_unicode() + { + auto val = stoll(hex_matches, 0, 16); + hex_matches.clear(); + match += str_from_ll(val); + is_escaped = false; + is_unicode = false; + } + void parse(const char_type t_char, const int line, const int col, const std::string &filename) { const bool is_octal_char = t_char >= '0' && t_char <= '7'; + const bool is_hex_char = (t_char >= '0' && t_char <= '9') + || (t_char >= 'a' && t_char <= 'f') + || (t_char >= 'A' && t_char <= 'F'); + if (is_octal) { if (is_octal_char) { octal_matches.push_back(t_char); @@ -1000,10 +1043,6 @@ namespace chaiscript process_octal(); } } else if (is_hex) { - const bool is_hex_char = (t_char >= '0' && t_char <= '9') - || (t_char >= 'a' && t_char <= 'f') - || (t_char >= 'A' && t_char <= 'F'); - if (is_hex_char) { hex_matches.push_back(t_char); @@ -1018,6 +1057,21 @@ namespace chaiscript } else { process_hex(); } + } else if (is_unicode) { + if (is_hex_char) { + hex_matches.push_back(t_char); + + if(hex_matches.size() == 4) { + // Format is specified to be 'slash'uABCD + // on collecting from A to D do parsing + process_unicode(); + } + return; + } else { + // Not a unicode anymore, try parsing any way + // May be someone used 'slash'uAA only + process_unicode(); + } } if (t_char == '\\') { @@ -1034,6 +1088,8 @@ namespace chaiscript octal_matches.push_back(t_char); } else if (t_char == 'x') { is_hex = true; + } else if (t_char == 'u') { + is_unicode = true; } else { switch (t_char) { case ('\'') : match.push_back('\''); break; From bd263555167768ecfa61073bf4d6f2a2e5054fd4 Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 12:38:50 +0300 Subject: [PATCH 02/10] No warnings from MSVC --- include/chaiscript/language/chaiscript_parser.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index 4104278f..5e9765f2 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -940,8 +940,8 @@ namespace chaiscript static std::string str_from_ll(long long val) { std::string::value_type c[2]; - c[1] = val; - c[0] = val >> 8; + c[1] = std::string::value_type(val); + c[0] = std::string::value_type(val >> 8); if (c[0] == 0) { From 2adefaf46d81c12e59985951b7b30da830a3d1e1 Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 12:39:03 +0300 Subject: [PATCH 03/10] Basic unit tests --- unittests/string_unicode_ascii.chai | 8 ++++++++ unittests/string_unicode_parse.chai | 11 +++++++++++ unittests/string_unicode_unicode.chai | 5 +++++ 3 files changed, 24 insertions(+) create mode 100644 unittests/string_unicode_ascii.chai create mode 100644 unittests/string_unicode_parse.chai create mode 100644 unittests/string_unicode_unicode.chai diff --git a/unittests/string_unicode_ascii.chai b/unittests/string_unicode_ascii.chai new file mode 100644 index 00000000..aca62d8f --- /dev/null +++ b/unittests/string_unicode_ascii.chai @@ -0,0 +1,8 @@ +assert_equal('\u0020', ' ') +assert_equal('\u0021', '!') +assert_equal('\u0030', '0') +assert_equal('\u0040', '@') +assert_equal('\u005B', '[') +assert_equal('\u005d', ']') +assert_equal('\u0061', 'a') +assert_equal('\u007e', '~') diff --git a/unittests/string_unicode_parse.chai b/unittests/string_unicode_parse.chai new file mode 100644 index 00000000..8807126e --- /dev/null +++ b/unittests/string_unicode_parse.chai @@ -0,0 +1,11 @@ +assert_equal('\u00aa', '\u00AA') +assert_equal('\u00bb', '\uBB') +assert_equal('\ucc', '\u00CC') +assert_equal('\udd', '\uDD') + +assert_equal('\u0ee', '\uEE') +assert_equal('\ue', '\u000E') + +assert_equal("\u30\u31\u32", "123") +assert_equal("\u33Test", "4Test") +assert_equal("Test\u0040", "Test@") diff --git a/unittests/string_unicode_unicode.chai b/unittests/string_unicode_unicode.chai new file mode 100644 index 00000000..3911d8e3 --- /dev/null +++ b/unittests/string_unicode_unicode.chai @@ -0,0 +1,5 @@ +assert_equal('\u0220', 'Ü') +assert_equal("U for \u0220mlauts", "U for Ümlauts") +assert_equal("More \u0220ml\u0228\u0252ts", "More Ümläüts") + +assert_equal("Happy \u30C4 face", "Happy ツ face") From 8478ddc47089f44e2f8685f504325e4e20735e54 Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 12:45:38 +0300 Subject: [PATCH 04/10] Move details to detail namespace, make to standard --- .../chaiscript/language/chaiscript_parser.hpp | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index 5e9765f2..8120fac6 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -55,6 +55,29 @@ namespace chaiscript , max_alphabet , lengthof_alphabet = 256 }; + + // Generic for u16, u32 and (probably) wchar + template + static string_type str_from_ll(long long val) + { + return string_type(1, string_type::value_type(val)); //size, character + } + + // Specialization for char + template<> + static std::string str_from_ll(long long val) + { + std::string::value_type c[2]; + c[1] = std::string::value_type(val); + c[0] = std::string::value_type(val >> 8); + + if (c[0] == 0) + { + return std::string(1, c[1]); //size, character + } + + return std::string(c, 2); //char buffer, size + } } class ChaiScript_Parser { @@ -928,29 +951,6 @@ namespace chaiscript return false; } - // Generic for u16, u32 and (probably) wchar - template - static string_type str_from_ll(long long val) - { - return string_type(1, string_type::value_type(val)); //size, character - } - - // Specialization for char - template<> - static std::string str_from_ll(long long val) - { - std::string::value_type c[2]; - c[1] = std::string::value_type(val); - c[0] = std::string::value_type(val >> 8); - - if (c[0] == 0) - { - return std::string(1, c[1]); //size, character - } - - return std::string(c, 2); //char buffer, size - } - template struct Char_Parser { @@ -1019,7 +1019,7 @@ namespace chaiscript { auto val = stoll(hex_matches, 0, 16); hex_matches.clear(); - match += str_from_ll(val); + match += detail::str_from_ll(val); is_escaped = false; is_unicode = false; } From e3e90de02a51f0ce859638854074166e3923e500 Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 13:09:32 +0300 Subject: [PATCH 05/10] Proper comparison in unit tests; remove leftover static keyword --- include/chaiscript/language/chaiscript_parser.hpp | 4 ++-- unittests/string_unicode_parse.chai | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index 8120fac6..f67bbf78 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -58,14 +58,14 @@ namespace chaiscript // Generic for u16, u32 and (probably) wchar template - static string_type str_from_ll(long long val) + string_type str_from_ll(long long val) { return string_type(1, string_type::value_type(val)); //size, character } // Specialization for char template<> - static std::string str_from_ll(long long val) + std::string str_from_ll(long long val) { std::string::value_type c[2]; c[1] = std::string::value_type(val); diff --git a/unittests/string_unicode_parse.chai b/unittests/string_unicode_parse.chai index 8807126e..50da68bb 100644 --- a/unittests/string_unicode_parse.chai +++ b/unittests/string_unicode_parse.chai @@ -6,6 +6,6 @@ assert_equal('\udd', '\uDD') assert_equal('\u0ee', '\uEE') assert_equal('\ue', '\u000E') -assert_equal("\u30\u31\u32", "123") -assert_equal("\u33Test", "4Test") +assert_equal("\u30\u31\u32", "012") +assert_equal("\u33Test", "3Test") assert_equal("Test\u0040", "Test@") From 368a3b78a26e2ae7b2174454c8c0fc45742e6d7d Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 13:46:37 +0300 Subject: [PATCH 06/10] create holder class --- .../chaiscript/language/chaiscript_parser.hpp | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index f67bbf78..c866c1ed 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -58,26 +58,32 @@ namespace chaiscript // Generic for u16, u32 and (probably) wchar template - string_type str_from_ll(long long val) + struct Char_Parser_Helper { - return string_type(1, string_type::value_type(val)); //size, character - } + static string_type str_from_ll(long long val) + { + return string_type(1, string_type::value_type(val)); //size, character + } + }; // Specialization for char template<> - std::string str_from_ll(long long val) + struct Char_Parser_Helper { - std::string::value_type c[2]; - c[1] = std::string::value_type(val); - c[0] = std::string::value_type(val >> 8); - - if (c[0] == 0) + static std::string str_from_ll(long long val) { - return std::string(1, c[1]); //size, character - } + std::string::value_type c[2]; + c[1] = std::string::value_type(val); + c[0] = std::string::value_type(val >> 8); - return std::string(c, 2); //char buffer, size - } + if (c[0] == 0) + { + return std::string(1, c[1]); //size, character + } + + return std::string(c, 2); //char buffer, size + } + }; } class ChaiScript_Parser { @@ -1019,7 +1025,7 @@ namespace chaiscript { auto val = stoll(hex_matches, 0, 16); hex_matches.clear(); - match += detail::str_from_ll(val); + match += detail::Char_Parser_Helper::str_from_ll(val); is_escaped = false; is_unicode = false; } From 830b7c93ca3a33ad4d20e69699455936a56208a2 Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 15:26:07 +0300 Subject: [PATCH 07/10] Fix unit test, limit unit test to UTF-8 --- unittests/string_unicode_unicode.chai | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unittests/string_unicode_unicode.chai b/unittests/string_unicode_unicode.chai index 3911d8e3..93092167 100644 --- a/unittests/string_unicode_unicode.chai +++ b/unittests/string_unicode_unicode.chai @@ -1,5 +1,5 @@ -assert_equal('\u0220', 'Ü') -assert_equal("U for \u0220mlauts", "U for Ümlauts") -assert_equal("More \u0220ml\u0228\u0252ts", "More Ümläüts") +assert_equal('\uc39c', 'Ü') +assert_equal("U for \uc39cmlauts", "U for Ümlauts") +assert_equal("More \uc39cml\uc3a4\uc3bcts", "More Ümläüts") -assert_equal("Happy \u30C4 face", "Happy ツ face") +assert_equal("Thorn \uc3be sign", "Thorn þ sign") From 58ebb22c55414f1533517a985fabc3ee0fea321d Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 16:03:32 +0300 Subject: [PATCH 08/10] clean-up conversion for other than std:string --- .../chaiscript/language/chaiscript_parser.hpp | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index c866c1ed..95b297c1 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include @@ -56,32 +58,43 @@ namespace chaiscript , lengthof_alphabet = 256 }; - // Generic for u16, u32 and (probably) wchar + // Generic for u16, u32 and wchar template struct Char_Parser_Helper { + typedef typename string_type::value_type target_char_type; + static string_type str_from_ll(long long val) { - return string_type(1, string_type::value_type(val)); //size, character + // make proper UTF-8 string + const std::string intermediate = Char_Parser_Helper::str_from_ll(val); + // prepare converter + std::wstring_convert, target_char_type> converter; + // convert + const string_type result = converter.from_bytes(intermediate); + + return result; } }; - // Specialization for char + // Specialization for char AKA UTF-8 template<> struct Char_Parser_Helper { + typedef std::string::value_type char_type; + static std::string str_from_ll(long long val) { - std::string::value_type c[2]; - c[1] = std::string::value_type(val); - c[0] = std::string::value_type(val >> 8); + char_type c[2]; + c[1] = char_type(val); + c[0] = char_type(val >> 8); if (c[0] == 0) { - return std::string(1, c[1]); //size, character + return std::string(1, c[1]); // size, character } - return std::string(c, 2); //char buffer, size + return std::string(c, 2); // char buffer, size } }; } From 201fef49c6ff10c59e688127c113ad12674d18cd Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 17:40:43 +0300 Subject: [PATCH 09/10] More standard compliant, use converter only where available --- include/chaiscript/chaiscript_defines.hpp | 4 ++ .../chaiscript/language/chaiscript_parser.hpp | 52 +++++++++++-------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/include/chaiscript/chaiscript_defines.hpp b/include/chaiscript/chaiscript_defines.hpp index 4049381a..b3fabf2f 100644 --- a/include/chaiscript/chaiscript_defines.hpp +++ b/include/chaiscript/chaiscript_defines.hpp @@ -60,6 +60,10 @@ #define CHAISCRIPT_MODULE_EXPORT extern "C" #endif +#if defined(CHAISCRIPT_MSVC) || (defined(__GNUC__) && __GNUC__ >= 5) || defined(CHAISCRIPT_CLANG) +#define CHAISCRIPT_UTF16_UTF32 +#endif + #ifdef _DEBUG #define CHAISCRIPT_DEBUG true #else diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index 95b297c1..887400ff 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -15,8 +15,11 @@ #include #include #include + +#if defined(CHAISCRIPT_UTF16_UTF32) #include #include +#endif @@ -62,29 +65,11 @@ namespace chaiscript template struct Char_Parser_Helper { - typedef typename string_type::value_type target_char_type; - - static string_type str_from_ll(long long val) + // common for all implementations + static std::string u8str_from_ll(long long val) { - // make proper UTF-8 string - const std::string intermediate = Char_Parser_Helper::str_from_ll(val); - // prepare converter - std::wstring_convert, target_char_type> converter; - // convert - const string_type result = converter.from_bytes(intermediate); + typedef std::string::value_type char_type; - return result; - } - }; - - // Specialization for char AKA UTF-8 - template<> - struct Char_Parser_Helper - { - typedef std::string::value_type char_type; - - static std::string str_from_ll(long long val) - { char_type c[2]; c[1] = char_type(val); c[0] = char_type(val >> 8); @@ -96,6 +81,31 @@ namespace chaiscript return std::string(c, 2); // char buffer, size } + + static string_type str_from_ll(long long val) + { + typedef typename string_type::value_type target_char_type; +#if defined (CHAISCRIPT_UTF16_UTF32) + // prepare converter + std::wstring_convert, target_char_type> converter; + // convert + return converter.from_bytes(u8str_from_ll(val)); +#else + // no conversion available, just put value as character + return string_type(1, target_char_type(val)); // size, character +#endif + } + }; + + // Specialization for char AKA UTF-8 + template<> + struct Char_Parser_Helper + { + static std::string str_from_ll(long long val) + { + // little SFINAE trick to avoid base class + return Char_Parser_Helper::u8str_from_ll(val); + } }; } From 5642e062e673bc57926ca4aef0371bd8852396c8 Mon Sep 17 00:00:00 2001 From: ELynx Date: Mon, 27 Jun 2016 18:02:50 +0300 Subject: [PATCH 10/10] Fix unittest --- unittests/string_unicode_unicode.chai | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittests/string_unicode_unicode.chai b/unittests/string_unicode_unicode.chai index 93092167..2a237ed8 100644 --- a/unittests/string_unicode_unicode.chai +++ b/unittests/string_unicode_unicode.chai @@ -1,4 +1,4 @@ -assert_equal('\uc39c', 'Ü') +assert_equal("\uc39c", "Ü") assert_equal("U for \uc39cmlauts", "U for Ümlauts") assert_equal("More \uc39cml\uc3a4\uc3bcts", "More Ümläüts")