Merge pull request #418 from ChaiScript/apply_unicode_patches

Apply unicode patches
2026-01-01 03:12:23 +08:00 · 2018-05-08 09:08:25 -06:00 · 2018-05-08 09:08:25 -06:00 · 06191646d2
commit 06191646d2
parent 9a670d79fc 258cb23dda
4 changed files with 78 additions and 32 deletions
--- a/include/chaiscript/language/chaiscript_parser.hpp
+++ b/include/chaiscript/language/chaiscript_parser.hpp
@ -1038,7 +1038,7 @@ namespace chaiscript
        bool saw_interpolation_marker = false;
        bool is_octal = false;
        bool is_hex = false;
-        bool is_unicode = false;
+        std::size_t unicode_size = 0;
        const bool interpolation_allowed;

        string_type octal_matches;
@ -1062,12 +1062,12 @@ namespace chaiscript
              process_hex();
            }

-            if (is_unicode) {
+            if (unicode_size > 0) {
              process_unicode();
            }
          } catch (const std::invalid_argument &) {
-            // escape sequence was invalid somehow, we'll pick this
-            // up in the next part of parsing
+          } catch (const exception::eval_error &) {
+            // Malformed escape (e.g. eval_error from process_unicode); ignored here on the assumption that a later parsing stage reports it - TODO confirm
          }
        }

@ -1097,13 +1097,43 @@ namespace chaiscript

        void process_unicode()
        {
-          if (!hex_matches.empty()) {
-            auto val = stoll(hex_matches, nullptr, 16);
-            hex_matches.clear();
-            match += detail::Char_Parser_Helper<string_type>::str_from_ll(val);
-          }
+          // Decode the hex digits collected for a 'slash'u / 'slash'U escape and append the code
+          // point to match as UTF-8. Throws eval_error if malformed (invalid_argument if no digits).
+          // stoul, not stoi: eight hex digits can exceed INT_MAX (e.g. 'slash'UFFFFFFFF).
+          const auto ch = static_cast<uint32_t>(std::stoul(hex_matches, nullptr, 16));
+          const auto match_size = hex_matches.size();
+          hex_matches.clear();
          is_escaped = false;
-          is_unicode = false;
+          const auto u_size = unicode_size;
+          unicode_size = 0;
+          if (u_size != match_size) {
+            throw exception::eval_error("Incomplete unicode escape sequence");
+          }
+          // Surrogates have no valid UTF-8 form, whichever escape width was used
+          if (ch >= 0xD800 && ch <= 0xDFFF) {
+            throw exception::eval_error(u_size == 4 ? "Invalid 16 bit universal character" : "Invalid 32 bit universal character");
+          }
+          char buf[4];
+          if (ch < 0x80) {
+            match += static_cast<char>(ch);
+          } else if (ch < 0x800) {
+            buf[0] = static_cast<char>(0xC0 | (ch >> 6));
+            buf[1] = static_cast<char>(0x80 | (ch & 0x3F));
+            match.append(buf, 2);
+          } else if (ch < 0x10000) {
+            buf[0] = static_cast<char>(0xE0 |  (ch >> 12));
+            buf[1] = static_cast<char>(0x80 | ((ch >>  6) & 0x3F));
+            buf[2] = static_cast<char>(0x80 |  (ch        & 0x3F));
+            match.append(buf, 3);
+          } else if (ch <= 0x10FFFF) {
+            buf[0] = static_cast<char>(0xF0 |  (ch >> 18));
+            buf[1] = static_cast<char>(0x80 | ((ch >> 12) & 0x3F));
+            buf[2] = static_cast<char>(0x80 | ((ch >>  6) & 0x3F));
+            buf[3] = static_cast<char>(0x80 |  (ch        & 0x3F));
+            match.append(buf, 4);
+          } else {
+            throw exception::eval_error("Invalid 32 bit universal character");
+          }
        }

        void parse(const char_type t_char, const int line, const int col, const std::string &filename) {
@ -1139,16 +1169,16 @@ namespace chaiscript
            } else {
              process_hex();
            }
-          } else if (is_unicode) {
+          } else if (unicode_size > 0) {
            if (is_hex_char) {
              hex_matches.push_back(t_char);

-            if(hex_matches.size() == 4) {
-              // Format is specified to be 'slash'uABCD
-              // on collecting from A to D do parsing
-              process_unicode();
-            }
-            return;
+              if(hex_matches.size() == unicode_size) {
+                // Format is 'slash'uABCD or 'slash'UABCDEFGH; once all
+                // unicode_size hex digits are collected, do the parsing
+                process_unicode();
+              }
+              return;
            } else {
              // Not a unicode anymore, try parsing any way
              // May be someone used 'slash'uAA only
@ -1171,7 +1201,9 @@ namespace chaiscript
              } else if (t_char == 'x') {
                is_hex = true;
              } else if (t_char == 'u') {
-                is_unicode = true;
+                unicode_size = 4;
+              } else if (t_char == 'U') {
+                unicode_size = 8;
              } else {
                switch (t_char) {
                  case ('\'') : match.push_back('\''); break;
--- a/unittests/compiled_tests.cpp
+++ b/unittests/compiled_tests.cpp
@ -4,7 +4,7 @@

 #ifdef _MSC_VER
 #pragma warning(push)
-#pragma warning(disable : 4062 4242 4640 4702 6330 28251)
+#pragma warning(disable : 4062 4242 4566 4640 4702 6330 28251)
 #endif


@ -1271,6 +1271,16 @@ TEST_CASE("Test reference member being registered")
  CHECK(d == Approx(2.3));
 }

+TEST_CASE("Test unicode matches C++") // each CHECK compares the C++ compiler's literal with ChaiScript's result for the same escape text
+{
+  chaiscript::ChaiScript_Basic chai(create_chaiscript_stdlib(),create_chaiscript_parser());
+  CHECK(u8"\U000000AC" == chai.eval<std::string>(R"("\U000000AC")")); // 8-digit escape for U+00AC (two UTF-8 bytes); raw string passes the escape to ChaiScript unprocessed
+  CHECK("\xF0\x9F\x8D\x8C" == chai.eval<std::string>(R"("\xF0\x9F\x8D\x8C")")); // the same four bytes spelled as individual hex escapes on both sides
+  CHECK(u8"\U0001F34C" == chai.eval<std::string>(R"("\U0001F34C")")); // 8-digit escape for U+1F34C (four UTF-8 bytes)
+  CHECK(u8"\u2022" == chai.eval<std::string>(R"("\u2022")")); // 4-digit escape for U+2022 (three UTF-8 bytes)
+
+}
+

 const int add_3(const int &i)
 {
--- a/unittests/string_unicode_parse.chai
+++ b/unittests/string_unicode_parse.chai
@ -1,11 +1,4 @@
-assert_equal('\u00aa', '\u00AA')
-assert_equal('\u00bb', '\uBB')
-assert_equal('\ucc', '\u00CC')
-assert_equal('\udd', '\uDD')
+assert_equal("\u00aa", "\u00AA")
+assert_equal("\u00bb", "\xC2\xBB")

-assert_equal('\u0ee', '\uEE')
-assert_equal('\ue', '\u000E')
-
-assert_equal("\u30\u31\u32", "012")
-assert_equal("\u33Test", "3Test")
 assert_equal("Test\u0040", "Test@")
--- a/unittests/string_unicode_unicode.chai
+++ b/unittests/string_unicode_unicode.chai
@ -1,5 +1,16 @@
-assert_equal("\uc39c", "Ü")
-assert_equal("U for \uc39cmlauts", "U for Ümlauts")
-assert_equal("More \uc39cml\uc3a4\uc3bcts", "More Ümläüts")
+assert_equal("\uc39c", "쎜")
+assert_equal("U for \u00dcmlauts", "U for Ümlauts")

-assert_equal("Thorn \uc3be sign", "Thorn þ sign")
+assert_equal("Thorn \u00fe sign", "Thorn þ sign")
+assert_equal("Test\u0020Me", "Test Me")
+assert_equal("Test\u2022Me", "Test•Me")
+
+assert_equal("\xF0\x9F\x8D\x8C", "🍌")
+assert_equal("\U0001F34C", "🍌")
+
+assert_throws("Invalid 16 bit universal character", fun(){ parse("\"\\uD83C\""); });
+assert_throws("Incomplete unicode escape sequence", fun(){ parse("\"\\uD83 \""); });
+
+assert_equal("\U00024B62", "𤭢")
+
+assert_equal("Test\U0001F534Me", "Test🔴Me")