From bcf2fdbf50e9cebe2813853e56759e642c5fe6e8 Mon Sep 17 00:00:00 2001 From: olikraus Date: Fri, 10 Apr 2026 19:49:48 -0600 Subject: [PATCH 1/2] Fix #477: Handle \u unicode escape sequences in JSON parser Convert \u escape sequences to proper UTF-8 characters instead of passing through the literal \u notation. Each \uXXXX escape carries exactly four hex digits, so this covers the full BMP range (U+0000-U+FFFF) with correct 1, 2, and 3-byte UTF-8 encoding; the 4-byte branch is kept for completeness but cannot be reached from a single escape, and UTF-16 surrogate pairs are not combined into one code point. Based on PR #483 by @olikraus, rebased onto current develop. Co-Authored-By: Claude Opus 4.6 (1M context) --- include/chaiscript/utility/json.hpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/include/chaiscript/utility/json.hpp b/include/chaiscript/utility/json.hpp index 25be353a..aa8fe698 100644 --- a/include/chaiscript/utility/json.hpp +++ b/include/chaiscript/utility/json.hpp @@ -460,17 +460,35 @@ namespace chaiscript::json { val += '\t'; break; case 'u': { - val += "\\u"; + std::string hex_matches; for (size_t i = 1; i <= 4; ++i) { c = str.at(offset + i); if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { - val += c; + hex_matches += c; } else { throw std::runtime_error( std::string("JSON ERROR: String: Expected hex character in unicode escape, found '") + c + "'"); } } offset += 4; + const auto ch = static_cast(std::stoi(hex_matches, nullptr, 16)); + if (ch < 0x80) { + val += static_cast(ch); + } else if (ch < 0x800) { + val += static_cast(0xC0 | (ch >> 6)); + val += static_cast(0x80 | (ch & 0x3F)); + } else if (ch < 0x10000) { + val += static_cast(0xE0 | (ch >> 12)); + val += static_cast(0x80 | ((ch >> 6) & 0x3F)); + val += static_cast(0x80 | (ch & 0x3F)); + } else if (ch < 0x200000) { + val += static_cast(0xF0 | (ch >> 18)); + val += static_cast(0x80 | ((ch >> 12) & 0x3F)); + val += static_cast(0x80 | ((ch >> 6) & 0x3F)); + val += static_cast(0x80 | (ch & 0x3F)); + } else { + throw std::runtime_error(std::string("JSON ERROR: String: Invalid 32 bit universal character")); + } } break; default: val += 
'\\'; From 91e50bc80f10aff2f782fe50d1008d9ed404ae88 Mon Sep 17 00:00:00 2001 From: leftibot Date: Fri, 10 Apr 2026 19:49:56 -0600 Subject: [PATCH 2/2] Add tests for JSON \u unicode escape sequences Tests cover ASCII range, 2-byte UTF-8 (U+00C4), 3-byte UTF-8 (U+20AC), mixed text, multiple escapes, uppercase hex, and unicode in object values. Co-Authored-By: Claude Opus 4.6 (1M context) --- unittests/json_unicode_1.chai | 2 ++ unittests/json_unicode_2.chai | 3 +++ unittests/json_unicode_3.chai | 2 ++ unittests/json_unicode_4.chai | 2 ++ unittests/json_unicode_5.chai | 2 ++ unittests/json_unicode_6.chai | 2 ++ unittests/json_unicode_7.chai | 2 ++ unittests/json_unicode_8.chai | 3 +++ 8 files changed, 18 insertions(+) create mode 100644 unittests/json_unicode_1.chai create mode 100644 unittests/json_unicode_2.chai create mode 100644 unittests/json_unicode_3.chai create mode 100644 unittests/json_unicode_4.chai create mode 100644 unittests/json_unicode_5.chai create mode 100644 unittests/json_unicode_6.chai create mode 100644 unittests/json_unicode_7.chai create mode 100644 unittests/json_unicode_8.chai diff --git a/unittests/json_unicode_1.chai b/unittests/json_unicode_1.chai new file mode 100644 index 00000000..572ca627 --- /dev/null +++ b/unittests/json_unicode_1.chai @@ -0,0 +1,2 @@ +// Test JSON \u escape: ASCII range (U+0041 = 'A') +assert_equal(from_json("\"\\u0041\""), "A") diff --git a/unittests/json_unicode_2.chai b/unittests/json_unicode_2.chai new file mode 100644 index 00000000..57ba4c54 --- /dev/null +++ b/unittests/json_unicode_2.chai @@ -0,0 +1,3 @@ +// Test JSON \u escape: 2-byte UTF-8 (U+00C4 = 'Ä') +// This is the example from issue #477 +assert_equal(from_json("\"\\u00c4\""), "\u00C4") diff --git a/unittests/json_unicode_3.chai b/unittests/json_unicode_3.chai new file mode 100644 index 00000000..f77d7a23 --- /dev/null +++ b/unittests/json_unicode_3.chai @@ -0,0 +1,2 @@ +// Test JSON \u escape: 3-byte UTF-8 (U+20AC = '€') 
+assert_equal(from_json("\"\\u20AC\""), "\u20AC") diff --git a/unittests/json_unicode_4.chai b/unittests/json_unicode_4.chai new file mode 100644 index 00000000..7b6f780f --- /dev/null +++ b/unittests/json_unicode_4.chai @@ -0,0 +1,2 @@ +// Test JSON \u escape: mixed with regular text +assert_equal(from_json("\"Hello \\u0057orld\""), "Hello World") diff --git a/unittests/json_unicode_5.chai b/unittests/json_unicode_5.chai new file mode 100644 index 00000000..786f09e7 --- /dev/null +++ b/unittests/json_unicode_5.chai @@ -0,0 +1,2 @@ +// Test JSON \u escape: multiple unicode escapes in one string +assert_equal(from_json("\"\\u0048\\u0065\\u006C\\u006C\\u006F\""), "Hello") diff --git a/unittests/json_unicode_6.chai b/unittests/json_unicode_6.chai new file mode 100644 index 00000000..92f4a993 --- /dev/null +++ b/unittests/json_unicode_6.chai @@ -0,0 +1,2 @@ +// Test JSON \u escape: uppercase hex digits +assert_equal(from_json("\"\\u00C4\""), "\u00C4") diff --git a/unittests/json_unicode_7.chai b/unittests/json_unicode_7.chai new file mode 100644 index 00000000..dd34aaf3 --- /dev/null +++ b/unittests/json_unicode_7.chai @@ -0,0 +1,2 @@ +// Test JSON \u escape: escape surrounded by text on both sides (U+0041 = 'A'); U+0000 itself is not exercised here +assert_equal(from_json("\"before\\u0041after\""), "beforeAafter") diff --git a/unittests/json_unicode_8.chai b/unittests/json_unicode_8.chai new file mode 100644 index 00000000..3796ea70 --- /dev/null +++ b/unittests/json_unicode_8.chai @@ -0,0 +1,3 @@ +// Test JSON \u escape inside an object value +var m = from_json("{\"key\": \"\\u00C4\\u00D6\\u00DC\"}") +assert_equal(m["key"], "\u00C4\u00D6\u00DC")