diff --git a/include/chaiscript/language/chaiscript_parser.hpp b/include/chaiscript/language/chaiscript_parser.hpp
index c82aff38..649f6f21 100644
--- a/include/chaiscript/language/chaiscript_parser.hpp
+++ b/include/chaiscript/language/chaiscript_parser.hpp
@@ -1038,7 +1038,7 @@ namespace chaiscript
           bool saw_interpolation_marker = false;
           bool is_octal = false;
           bool is_hex = false;
-          bool is_unicode = false;
+          std::size_t unicode_size = 0;
           const bool interpolation_allowed;
 
           string_type octal_matches;
@@ -1062,12 +1062,12 @@ namespace chaiscript
                 process_hex();
               }
 
-              if (is_unicode) {
+              if (unicode_size > 0) {
                 process_unicode();
               }
             } catch (const std::invalid_argument &) {
-              // escape sequence was invalid somehow, we'll pick this
-              // up in the next part of parsing
+            } catch (const exception::eval_error &) {
+              // as with invalid_argument above: the escape sequence was invalid, it is picked up later in parsing
             }
           }
 
@@ -1097,13 +1097,46 @@ namespace chaiscript
 
           void process_unicode()
           {
-            if (!hex_matches.empty()) {
-              auto val = stoll(hex_matches, nullptr, 16);
-              hex_matches.clear();
-              match += detail::Char_Parser_Helper::str_from_ll(val);
-            }
+            // stoul, not stoi: eight hex digits can exceed INT_MAX, and stoi would then
+            // throw std::out_of_range instead of reaching the range checks below
+            const auto ch = static_cast<uint32_t>(std::stoul(hex_matches, nullptr, 16));
+            const auto match_size = hex_matches.size();
+            hex_matches.clear();
             is_escaped = false;
-            is_unicode = false;
+            const auto u_size = unicode_size;
+            unicode_size = 0;
+
+            char buf[4];
+            if (u_size != match_size) {
+              throw exception::eval_error("Incomplete unicode escape sequence");
+            }
+            // surrogates are never valid code points, in the 4 digit or the 8 digit form
+            if (ch >= 0xD800 && ch <= 0xDFFF) {
+              throw exception::eval_error("Invalid 16 bit universal character");
+            }
+
+            // append the code point to match, encoded as UTF-8
+            if (ch < 0x80) {
+              match += static_cast<char>(ch);
+            } else if (ch < 0x800) {
+              buf[0] = static_cast<char>(0xC0 | (ch >> 6));
+              buf[1] = static_cast<char>(0x80 | (ch & 0x3F));
+              match.append(buf, 2);
+            } else if (ch < 0x10000) {
+              buf[0] = static_cast<char>(0xE0 | (ch >> 12));
+              buf[1] = static_cast<char>(0x80 | ((ch >> 6) & 0x3F));
+              buf[2] = static_cast<char>(0x80 | (ch & 0x3F));
+              match.append(buf, 3);
+            } else if (ch < 0x110000) {
+              buf[0] = static_cast<char>(0xF0 | (ch >> 18));
+              buf[1] = static_cast<char>(0x80 | ((ch >> 12) & 0x3F));
+              buf[2] = static_cast<char>(0x80 | ((ch >> 6) & 0x3F));
+              buf[3] = static_cast<char>(0x80 | (ch & 0x3F));
+              match.append(buf, 4);
+            } else {
+              // above U+10FFFF: not a Unicode code point
+              throw exception::eval_error("Invalid 32 bit universal character");
+            }
           }
 
           void parse(const char_type t_char, const int line, const int col, const std::string &filename) {
@@ -1139,16 +1172,16 @@ namespace chaiscript
               } else {
                 process_hex();
               }
-            } else if (is_unicode) {
+            } else if (unicode_size > 0) {
               if (is_hex_char) {
                 hex_matches.push_back(t_char);
 
-              if(hex_matches.size() == 4) {
-                // Format is specified to be 'slash'uABCD
-                // on collecting from A to D do parsing
-                process_unicode();
-              }
-              return;
+                if(hex_matches.size() == unicode_size) {
+                  // 'slash'u takes 4 hex digits, 'slash'U takes 8;
+                  // once all of them are collected, decode the escape
+                  process_unicode();
+                }
+                return;
               } else {
                 // Not a unicode anymore, try parsing any way
                 // May be someone used 'slash'uAA only
@@ -1171,7 +1204,9 @@ namespace chaiscript
               } else if (t_char == 'x') {
                 is_hex = true;
               } else if (t_char == 'u') {
-                is_unicode = true;
+                unicode_size = 4;
+              } else if (t_char == 'U') {
+                unicode_size = 8;
               } else {
                 switch (t_char) {
                   case ('\'') : match.push_back('\''); break;
diff --git a/unittests/compiled_tests.cpp b/unittests/compiled_tests.cpp
index a39c3f8d..3df1dbe2 100644
--- a/unittests/compiled_tests.cpp
+++ b/unittests/compiled_tests.cpp
@@ -4,7 +4,7 @@
 
 #ifdef _MSC_VER
 #pragma warning(push)
-#pragma warning(disable : 4062 4242 4640 4702 6330 28251)
+#pragma warning(disable : 4062 4242 4566 4640 4702 6330 28251)
 #endif
 
 
@@ -1271,6 +1271,16 @@ TEST_CASE("Test reference member being registered")
   CHECK(d == Approx(2.3));
 }
 
+TEST_CASE("Test unicode matches C++")
+{
+  chaiscript::ChaiScript_Basic chai(create_chaiscript_stdlib(),create_chaiscript_parser());
+  CHECK(u8"\U000000AC" == chai.eval<std::string>(R"("\U000000AC")"));
+  CHECK("\xF0\x9F\x8D\x8C" == chai.eval<std::string>(R"("\xF0\x9F\x8D\x8C")"));
+  CHECK(u8"\U0001F34C" == chai.eval<std::string>(R"("\U0001F34C")"));
+  CHECK(u8"\u2022" == chai.eval<std::string>(R"("\u2022")"));
+
+}
+
 
 const int add_3(const int &i)
 {
diff --git a/unittests/string_unicode_parse.chai b/unittests/string_unicode_parse.chai
index 50da68bb..ae2f247e 100644
--- a/unittests/string_unicode_parse.chai
+++ b/unittests/string_unicode_parse.chai
@@ -1,11 +1,4 @@
-assert_equal('\u00aa', '\u00AA')
-assert_equal('\u00bb', '\uBB')
-assert_equal('\ucc', '\u00CC')
-assert_equal('\udd', '\uDD')
+assert_equal("\u00aa", "\u00AA")
+assert_equal("\u00bb", "\xC2\xBB")
 
-assert_equal('\u0ee', '\uEE')
-assert_equal('\ue', '\u000E')
-
-assert_equal("\u30\u31\u32", "012")
-assert_equal("\u33Test", "3Test")
 assert_equal("Test\u0040", "Test@")
diff --git a/unittests/string_unicode_unicode.chai b/unittests/string_unicode_unicode.chai
index 2a237ed8..93364f05 100644
--- a/unittests/string_unicode_unicode.chai
+++ b/unittests/string_unicode_unicode.chai
@@ -1,5 +1,18 @@
-assert_equal("\uc39c", "Ü")
-assert_equal("U for \uc39cmlauts", "U for Ümlauts")
-assert_equal("More \uc39cml\uc3a4\uc3bcts", "More Ümläüts")
+assert_equal("\uc39c", "쎜")
+assert_equal("U for \u00dcmlauts", "U for Ümlauts")
 
-assert_equal("Thorn \uc3be sign", "Thorn þ sign")
+assert_equal("Thorn \u00fe sign", "Thorn þ sign")
+assert_equal("Test\u0020Me", "Test Me")
+assert_equal("Test\u2022Me", "Test•Me")
+
+assert_equal("\xF0\x9F\x8D\x8C", "🍌")
+assert_equal("\U0001F34C", "🍌")
+
+assert_throws("Invalid 16 bit universal character", fun(){ parse("\"\\uD83C\""); });
+assert_throws("Incomplete unicode escape sequence", fun(){ parse("\"\\uD83 \""); });
+assert_throws("Invalid 32 bit universal character", fun(){ parse("\"\\U00110000\""); });
+assert_throws("Invalid 16 bit universal character", fun(){ parse("\"\\U0000D800\""); });
+
+assert_equal("\U00024B62", "𤭢")
+
+assert_equal("Test\U0001F534Me", "Test🔴Me")