Fix #477: Handle \u unicode escape sequences in JSON parser

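Convert \u escape sequences to proper UTF-8 characters instead of
passing through the literal \u notation. Supports the full BMP range
(U+0000 through U+FFFF), which a four-digit \u escape encodes as 1, 2,
or 3 bytes of UTF-8. The encoder also contains a 4-byte branch, but a
single four-digit escape can never reach it; each escape is encoded on
its own, so surrogate pairs are not combined into one code point.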

Based on PR #483 by @olikraus, rebased onto current develop.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
olikraus 2026-04-10 19:49:48 -06:00 committed by leftibot
parent f59eff9b2f
commit bcf2fdbf50

View File

@ -460,17 +460,35 @@ namespace chaiscript::json {
val += '\t';
break;
case 'u': {
val += "\\u";
std::string hex_matches;
for (size_t i = 1; i <= 4; ++i) {
c = str.at(offset + i);
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
val += c;
hex_matches += c;
} else {
throw std::runtime_error(
std::string("JSON ERROR: String: Expected hex character in unicode escape, found '") + c + "'");
}
}
offset += 4;
const auto ch = static_cast<uint32_t>(std::stoi(hex_matches, nullptr, 16));
if (ch < 0x80) {
val += static_cast<char>(ch);
} else if (ch < 0x800) {
val += static_cast<char>(0xC0 | (ch >> 6));
val += static_cast<char>(0x80 | (ch & 0x3F));
} else if (ch < 0x10000) {
val += static_cast<char>(0xE0 | (ch >> 12));
val += static_cast<char>(0x80 | ((ch >> 6) & 0x3F));
val += static_cast<char>(0x80 | (ch & 0x3F));
} else if (ch < 0x200000) {
val += static_cast<char>(0xF0 | (ch >> 18));
val += static_cast<char>(0x80 | ((ch >> 12) & 0x3F));
val += static_cast<char>(0x80 | ((ch >> 6) & 0x3F));
val += static_cast<char>(0x80 | (ch & 0x3F));
} else {
throw std::runtime_error(std::string("JSON ERROR: String: Invalid 32 bit universal character"));
}
} break;
default:
val += '\\';