Merge pull request #276 from ELynx/develop

Add UTF-8 escape notation to parser
2026-01-01 03:12:23 +08:00 · 2016-06-29 15:03:40 -06:00 · 2016-06-29 15:03:40 -06:00 · 176d608bb4
commit 176d608bb4
parent a1d90c95f0 5642e062e6
5 changed files with 117 additions and 4 deletions
--- a/include/chaiscript/chaiscript_defines.hpp
+++ b/include/chaiscript/chaiscript_defines.hpp
@ -60,6 +60,10 @@
 #define CHAISCRIPT_MODULE_EXPORT extern "C" 
 #endif

+#if defined(CHAISCRIPT_MSVC) || (defined(__GNUC__) && __GNUC__ >= 5) || defined(CHAISCRIPT_CLANG)
+#define CHAISCRIPT_UTF16_UTF32 // when defined, the parser includes <locale>/<codecvt> and converts escapes with std::wstring_convert
+#endif // MSVC, GCC >= 5 or Clang
+
+
 #ifdef _DEBUG
 #define CHAISCRIPT_DEBUG true
 #else
--- a/include/chaiscript/language/chaiscript_parser.hpp
+++ b/include/chaiscript/language/chaiscript_parser.hpp
@ -16,6 +16,11 @@
 #include <cctype>
 #include <cstring>

+#if defined(CHAISCRIPT_UTF16_UTF32)
+#include <locale>
+#include <codecvt>
+#endif
+


 #include "../dispatchkit/boxed_value.hpp"
@ -55,6 +60,53 @@ namespace chaiscript
          ,   max_alphabet
          ,   lengthof_alphabet = 256
      };
+
+      // Generic implementation for u16, u32 and wchar strings: turns a numeric escape value into a string_type
+      template<typename string_type>
+      struct Char_Parser_Helper
+      {
+        // common for all implementations: packs val into a std::string of one byte (high byte zero) or two bytes (high byte first)
+        static std::string u8str_from_ll(long long val)
+        {
+          typedef std::string::value_type char_type;
+
+          char_type c[2];
+          c[1] = char_type(val);      // lowest byte of val
+          c[0] = char_type(val >> 8); // second-lowest byte; anything above 16 bits is not used
+
+          if (c[0] == 0)
+          {
+            return std::string(1, c[1]); // size, character
+          }
+
+          return std::string(c, 2); // char buffer, size
+        }
+
+        static string_type str_from_ll(long long val) // val -> string_type, decoding the packed bytes as UTF-8 when a converter is available
+        {
+          typedef typename string_type::value_type target_char_type;
+#if defined (CHAISCRIPT_UTF16_UTF32)
+          // prepare converter (default-constructed, so a failed conversion throws std::range_error)
+          std::wstring_convert<std::codecvt_utf8<target_char_type>, target_char_type> converter;
+          // convert the byte string built from val, interpreting it as UTF-8
+          return converter.from_bytes(u8str_from_ll(val));
+#else
+          // no conversion available, just put value as character
+          return string_type(1, target_char_type(val)); // size, character
+#endif
+        }
+      };
+
+      // Specialization for char AKA UTF-8: the escape value is emitted as raw byte(s) with no conversion
+      template<>
+      struct Char_Parser_Helper<std::string>
+      {
+        static std::string str_from_ll(long long val) // returns one or two bytes taken from val
+        {
+          // not SFINAE: names the generic template with an arbitrary type only to reach its static u8str_from_ll without a base class
+          return Char_Parser_Helper<std::true_type>::u8str_from_ll(val);
+        }
+      };
    }

    class ChaiScript_Parser {
@ -938,6 +990,7 @@ namespace chaiscript
        bool saw_interpolation_marker;
        bool is_octal;
        bool is_hex;
+        bool is_unicode;
        const bool interpolation_allowed;

        string_type octal_matches;
@ -950,6 +1003,7 @@ namespace chaiscript
            saw_interpolation_marker(false),
            is_octal(false),
            is_hex(false),
+            is_unicode(false),
            interpolation_allowed(t_interpolation_allowed)
        {
        }
@ -964,6 +1018,10 @@ namespace chaiscript
          if (is_hex) {
            process_hex();
          }
+
+          if (is_unicode) {
+            process_unicode();
+          }
        }

        void process_hex()
@ -985,9 +1043,23 @@ namespace chaiscript
          is_octal = false;
        }

+
+        // Finishes a 'slash'u escape: converts the hex digits collected in
+        // hex_matches to character(s) via Char_Parser_Helper, appends them to
+        // match, and leaves escape/unicode mode.
+        //
+        // If no hex digit was collected at all ('slash'u followed directly by
+        // a non-hex character, or by the end of the input) nothing is
+        // appended. Without this guard stoll would throw std::invalid_argument
+        // on the empty string.
+        void process_unicode()
+        {
+          if (!hex_matches.empty()) {
+            const auto val = stoll(hex_matches, 0, 16);
+            hex_matches.clear();
+            match += detail::Char_Parser_Helper<string_type>::str_from_ll(val);
+          }
+
+          // Always reset the state, even when there was nothing to convert,
+          // so the following characters are parsed as ordinary input.
+          is_escaped = false;
+          is_unicode = false;
+        }
+
        void parse(const char_type t_char, const int line, const int col, const std::string &filename) {
          const bool is_octal_char = t_char >= '0' && t_char <= '7';

+          const bool is_hex_char  = (t_char >= '0' && t_char <= '9')
+                                 || (t_char >= 'a' && t_char <= 'f')
+                                 || (t_char >= 'A' && t_char <= 'F');
+
          if (is_octal) {
            if (is_octal_char) {
              octal_matches.push_back(t_char);
@ -1000,10 +1072,6 @@ namespace chaiscript
              process_octal();
            }
          } else if (is_hex) {
-            const bool is_hex_char = (t_char >= '0' && t_char <= '9')
-                                  || (t_char >= 'a' && t_char <= 'f')
-                                  || (t_char >= 'A' && t_char <= 'F');
-
            if (is_hex_char) {
              hex_matches.push_back(t_char);

@ -1018,6 +1086,21 @@ namespace chaiscript
            } else {
              process_hex();
            }
+          } else if (is_unicode) {
+            if (is_hex_char) {
+              hex_matches.push_back(t_char);
+
+            if(hex_matches.size() == 4) {
+              // Format is specified to be 'slash'uABCD;
+              // once all four digits A to D are collected, parse them
+              process_unicode();
+            }
+            return;
+            } else {
+              // Not a hex digit, so the escape has ended; try parsing anyway
+              // Maybe someone used only 'slash'uAA
+              process_unicode();
+            }
          }

          if (t_char == '\\') {
@ -1034,6 +1117,8 @@ namespace chaiscript
                octal_matches.push_back(t_char);
              } else if (t_char == 'x') {
                is_hex = true;
+              } else if (t_char == 'u') {
+                is_unicode = true;
              } else {
                switch (t_char) {
                  case ('\'') : match.push_back('\''); break;
--- a/unittests/string_unicode_ascii.chai
+++ b/unittests/string_unicode_ascii.chai
@ -0,0 +1,8 @@
+assert_equal('\u0020', ' ') // four-digit escapes compared with the matching printable ASCII character
+assert_equal('\u0021', '!')
+assert_equal('\u0030', '0')
+assert_equal('\u0040', '@')
+assert_equal('\u005B', '[') // upper-case hex digit
+assert_equal('\u005d', ']') // lower-case hex digit
+assert_equal('\u0061', 'a')
+assert_equal('\u007e', '~')
--- a/unittests/string_unicode_parse.chai
+++ b/unittests/string_unicode_parse.chai
@ -0,0 +1,11 @@
+assert_equal('\u00aa', '\u00AA') // hex digit case does not matter
+assert_equal('\u00bb', '\uBB') // leading zeros may be omitted
+assert_equal('\ucc', '\u00CC')
+assert_equal('\udd', '\uDD')
+
+assert_equal('\u0ee', '\uEE') // odd digit counts are accepted as well
+assert_equal('\ue', '\u000E')
+
+assert_equal("\u30\u31\u32", "012") // a following backslash ends a short escape
+assert_equal("\u33Test", "3Test") // so does any other non-hex character
+assert_equal("Test\u0040", "Test@") // escape at the very end of the string
--- a/unittests/string_unicode_unicode.chai
+++ b/unittests/string_unicode_unicode.chai
@ -0,0 +1,5 @@
+assert_equal("\uc39c", "Ü") // the digits are the two UTF-8 bytes C3 9C, not a code point (assumes this file is saved as UTF-8)
+assert_equal("U for \uc39cmlauts", "U for Ümlauts") // the escape stops after four digits, so the following text is untouched
+assert_equal("More \uc39cml\uc3a4\uc3bcts", "More Ümläüts") // several escapes in one string
+
+assert_equal("Thorn \uc3be sign", "Thorn þ sign")