# Patch summary: adds a 'slash'u escape sequence to the ChaiScript string/char parser.
#
# include/chaiscript/chaiscript_defines.hpp
#   * Defines CHAISCRIPT_UTF16_UTF32 when CHAISCRIPT_MSVC is defined, when
#     __GNUC__ >= 5, or when CHAISCRIPT_CLANG is defined.
#
# include/chaiscript/language/chaiscript_parser.hpp
#   * Adds two #include lines guarded by CHAISCRIPT_UTF16_UTF32 (the header
#     names are not visible in this copy of the patch).
#   * Adds the Char_Parser_Helper template:
#       - u8str_from_ll(val): stores char_type(val >> 8) and char_type(val) in
#         a two-element buffer; returns a one-character string holding the low
#         byte when the high byte is 0, otherwise a string of both bytes.
#       - str_from_ll(val): with CHAISCRIPT_UTF16_UTF32, feeds the result of
#         u8str_from_ll through std::wstring_convert::from_bytes; without it,
#         returns a one-character string built from target_char_type(val).
#     and a specialization (commented "for char AKA UTF-8") whose str_from_ll
#     returns u8str_from_ll(val) directly.
#   * Adds the bool member is_unicode (initialized to false) and, right after
#     the existing "if (is_hex) process_hex()" check, a matching
#     "if (is_unicode) process_unicode()" check.
#   * Adds process_unicode(): parses hex_matches as base 16 with stoll, clears
#     hex_matches, appends Char_Parser_Helper::str_from_ll(val) to match, and
#     resets is_escaped and is_unicode.
#   * In parse(): is_hex_char is now computed once at the top instead of inside
#     the is_hex branch. A new is_unicode branch pushes hex digits onto
#     hex_matches, calls process_unicode() once four digits are collected and
#     returns; on a non-hex character it calls process_unicode() and then
#     continues with the normal handling of that character. An escaped 'u'
#     sets is_unicode.
#
# unittests/string_unicode_ascii.chai, string_unicode_parse.chai,
# string_unicode_unicode.chai (new files)
#   * Assert that 'slash'u escapes with one to four hex digits, in either
#     letter case, compare equal to the expected literal characters.
#
# NOTE(review): process_unicode() hands hex_matches to stoll without checking
#   that it is non-empty. When 'u' is followed directly by a non-hex character
#   the string looks like it would be empty, and std::stoll throws
#   std::invalid_argument when no conversion can be performed -- confirm and
#   decide how this should be reported to the script author.
# NOTE(review): u8str_from_ll emits the low one or two bytes of the value
#   unchanged, so the hex digits act as raw bytes rather than a Unicode code
#   point: the test expects 'slash'uc39c to equal "Ü" (U+00DC, UTF-8 bytes
#   C3 9C, assuming the .chai file is UTF-8 encoded). A value such as
#   'slash'u00aa therefore produces the single byte 0xAA -- confirm that this
#   is the intended meaning of the escape.
# NOTE(review): the in-code comment "little SFINAE trick" describes a plain call
#   to a static member of another Char_Parser_Helper instantiation; no SFINAE
#   is visible in this patch -- consider rewording.
# NOTE(review): std::wstring_convert is deprecated as of C++17 -- confirm the
#   targeted language standard.
diff --git a/include/chaiscript/chaiscript_defines.hpp b/include/chaiscript/chaiscript_defines.hpp index 4049381a..b3fabf2f 100644 --- a/include/chaiscript/chaiscript_defines.hpp +++ b/include/chaiscript/chaiscript_defines.hpp @@ -60,6 +60,10 @@ #define CHAISCRIPT_MODULE_EXPORT extern "C" #endif +#if defined(CHAISCRIPT_MSVC) || (defined(__GNUC__) && __GNUC__ >= 5) || defined(CHAISCRIPT_CLANG) +#define CHAISCRIPT_UTF16_UTF32 +#endif + #ifdef _DEBUG #define CHAISCRIPT_DEBUG true #else diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp index 1ea73716..887400ff 100644 --- a/include/chaiscript/language/chaiscript_parser.hpp +++ b/include/chaiscript/language/chaiscript_parser.hpp @@ -16,6 +16,11 @@ #include #include +#if defined(CHAISCRIPT_UTF16_UTF32) +#include +#include +#endif + #include "../dispatchkit/boxed_value.hpp" @@ -55,6 +60,53 @@ namespace chaiscript , max_alphabet , lengthof_alphabet = 256 }; + + // Generic for u16, u32 and wchar + template + struct Char_Parser_Helper + { + // common for all implementations + static std::string u8str_from_ll(long long val) + { + typedef std::string::value_type char_type; + + char_type c[2]; + c[1] = char_type(val); + c[0] = char_type(val >> 8); + + if (c[0] == 0) + { + return std::string(1, c[1]); // size, character + } + + return std::string(c, 2); // char buffer, size + } + + static string_type str_from_ll(long long val) + { + typedef typename string_type::value_type target_char_type; +#if defined (CHAISCRIPT_UTF16_UTF32) + // prepare converter + std::wstring_convert, target_char_type> converter; + // convert + return converter.from_bytes(u8str_from_ll(val)); +#else + // no conversion available, just put value as character + return string_type(1, target_char_type(val)); // size, character +#endif + } + }; + + // Specialization for char AKA UTF-8 + template<> + struct Char_Parser_Helper + { + static std::string str_from_ll(long long val) + { + // little SFINAE 
trick to avoid base class + return Char_Parser_Helper::u8str_from_ll(val); + } + }; } class ChaiScript_Parser { @@ -938,6 +990,7 @@ namespace chaiscript bool saw_interpolation_marker; bool is_octal; bool is_hex; + bool is_unicode; const bool interpolation_allowed; string_type octal_matches; @@ -950,6 +1003,7 @@ namespace chaiscript saw_interpolation_marker(false), is_octal(false), is_hex(false), + is_unicode(false), interpolation_allowed(t_interpolation_allowed) { } @@ -964,6 +1018,10 @@ namespace chaiscript if (is_hex) { process_hex(); } + + if (is_unicode) { + process_unicode(); + } } void process_hex() @@ -985,9 +1043,23 @@ namespace chaiscript is_octal = false; } + + void process_unicode() + { + auto val = stoll(hex_matches, 0, 16); + hex_matches.clear(); + match += detail::Char_Parser_Helper::str_from_ll(val); + is_escaped = false; + is_unicode = false; + } + void parse(const char_type t_char, const int line, const int col, const std::string &filename) { const bool is_octal_char = t_char >= '0' && t_char <= '7'; + const bool is_hex_char = (t_char >= '0' && t_char <= '9') + || (t_char >= 'a' && t_char <= 'f') + || (t_char >= 'A' && t_char <= 'F'); + if (is_octal) { if (is_octal_char) { octal_matches.push_back(t_char); @@ -1000,10 +1072,6 @@ namespace chaiscript process_octal(); } } else if (is_hex) { - const bool is_hex_char = (t_char >= '0' && t_char <= '9') - || (t_char >= 'a' && t_char <= 'f') - || (t_char >= 'A' && t_char <= 'F'); - if (is_hex_char) { hex_matches.push_back(t_char); @@ -1018,6 +1086,21 @@ namespace chaiscript } else { process_hex(); } + } else if (is_unicode) { + if (is_hex_char) { + hex_matches.push_back(t_char); + + if(hex_matches.size() == 4) { + // Format is specified to be 'slash'uABCD + // on collecting from A to D do parsing + process_unicode(); + } + return; + } else { + // Not a unicode anymore, try parsing any way + // May be someone used 'slash'uAA only + process_unicode(); + } } if (t_char == '\\') { @@ -1034,6 +1117,8 @@ 
namespace chaiscript octal_matches.push_back(t_char); } else if (t_char == 'x') { is_hex = true; + } else if (t_char == 'u') { + is_unicode = true; } else { switch (t_char) { case ('\'') : match.push_back('\''); break; diff --git a/unittests/string_unicode_ascii.chai b/unittests/string_unicode_ascii.chai new file mode 100644 index 00000000..aca62d8f --- /dev/null +++ b/unittests/string_unicode_ascii.chai @@ -0,0 +1,8 @@ +assert_equal('\u0020', ' ') +assert_equal('\u0021', '!') +assert_equal('\u0030', '0') +assert_equal('\u0040', '@') +assert_equal('\u005B', '[') +assert_equal('\u005d', ']') +assert_equal('\u0061', 'a') +assert_equal('\u007e', '~') diff --git a/unittests/string_unicode_parse.chai b/unittests/string_unicode_parse.chai new file mode 100644 index 00000000..50da68bb --- /dev/null +++ b/unittests/string_unicode_parse.chai @@ -0,0 +1,11 @@ +assert_equal('\u00aa', '\u00AA') +assert_equal('\u00bb', '\uBB') +assert_equal('\ucc', '\u00CC') +assert_equal('\udd', '\uDD') + +assert_equal('\u0ee', '\uEE') +assert_equal('\ue', '\u000E') + +assert_equal("\u30\u31\u32", "012") +assert_equal("\u33Test", "3Test") +assert_equal("Test\u0040", "Test@") diff --git a/unittests/string_unicode_unicode.chai b/unittests/string_unicode_unicode.chai new file mode 100644 index 00000000..2a237ed8 --- /dev/null +++ b/unittests/string_unicode_unicode.chai @@ -0,0 +1,5 @@ +assert_equal("\uc39c", "Ü") +assert_equal("U for \uc39cmlauts", "U for Ümlauts") +assert_equal("More \uc39cml\uc3a4\uc3bcts", "More Ümläüts") + +assert_equal("Thorn \uc3be sign", "Thorn þ sign")