// This file is distributed under the BSD License.
// See "license.txt" for details.
// http://www.chaiscript.com

#ifndef CHAISCRIPT_UTILITY_UNICODE_HPP_
#define CHAISCRIPT_UTILITY_UNICODE_HPP_

#include <cstddef>
#include <cstdint>
#include <string>

namespace chaiscript {
  namespace utility {
    namespace unicode {

      // Largest code point accepted by the UTF-16 and UTF-32 encoders below.
      inline constexpr std::uint32_t max_codepoint = 0x10FFFF;

      // True when cp lies in the surrogate range [0xD800, 0xDFFF].
      constexpr bool is_surrogate(std::uint32_t cp) noexcept { return cp >= 0xD800 && cp <= 0xDFFF; }

      // Append cp to out as UTF-8.
      //
      // Returns the number of bytes written (1-4), or 0 if cp >= 0x200000, in
      // which case out is left untouched.
      //
      // Surrogates are not rejected here; callers that care check is_surrogate()
      // first. Likewise, values in (max_codepoint, 0x200000) are still encoded
      // as 4-byte sequences: the only limit enforced is what the 4-byte form can
      // physically hold.
      inline std::size_t append_utf8(std::string &out, std::uint32_t cp) {
        if (cp < 0x80) {
          // 0xxxxxxx
          out += static_cast<char>(cp);
          return 1;
        }
        if (cp < 0x800) {
          // 110xxxxx 10xxxxxx
          out += static_cast<char>(0xC0 | (cp >> 6));
          out += static_cast<char>(0x80 | (cp & 0x3F));
          return 2;
        }
        if (cp < 0x10000) {
          // 1110xxxx 10xxxxxx 10xxxxxx
          out += static_cast<char>(0xE0 | (cp >> 12));
          out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
          out += static_cast<char>(0x80 | (cp & 0x3F));
          return 3;
        }
        if (cp < 0x200000) {
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          out += static_cast<char>(0xF0 | (cp >> 18));
          out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
          out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
          out += static_cast<char>(0x80 | (cp & 0x3F));
          return 4;
        }
        return 0;
      }

      // Append cp to out as UTF-16.
      //
      // Returns the number of code units written (1 for cp < 0x10000, otherwise
      // 2 for a high/low surrogate pair), or 0 if cp is a surrogate or
      // > max_codepoint, in which case out is left untouched.
      //
      // Values up to 0xDFFF are stored through static_cast<CharT>, so CharT must
      // be wide enough to hold a 16-bit code unit.
      template <typename CharT>
      inline std::size_t append_utf16(std::basic_string<CharT> &out, std::uint32_t cp) {
        if (is_surrogate(cp) || cp > max_codepoint) {
          return 0;
        }
        if (cp < 0x10000) {
          out += static_cast<CharT>(cp);
          return 1;
        }
        // Split the 20 bits above 0x10000 into two 10-bit halves.
        const std::uint32_t v = cp - 0x10000;
        out += static_cast<CharT>(0xD800 | (v >> 10));
        out += static_cast<CharT>(0xDC00 | (v & 0x3FF));
        return 2;
      }

      // Append cp to a basic_string. Dispatches on sizeof(CharT):
      //   1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32.
      // Any other CharT size fails to compile.
      //
      // Returns the number of code units written, or 0 if the code point was
      // rejected (out is then left untouched). What counts as rejected differs
      // per path:
      //   - UTF-16 / UTF-32: surrogates and cp > max_codepoint are rejected.
      //   - UTF-8: defers to append_utf8(), so only cp >= 0x200000 is rejected;
      //     surrogates are encoded. Callers that care check is_surrogate() first.
      template <typename CharT>
      inline std::size_t append_codepoint(std::basic_string<CharT> &out, std::uint32_t cp) {
        if constexpr (sizeof(CharT) == 1) {
          // Encode into a plain std::string first so that any 1-byte CharT is
          // supported, then copy the bytes across with element-wise conversion.
          std::string tmp;
          const auto n = append_utf8(tmp, cp);
          out.append(tmp.begin(), tmp.end());
          return n;
        } else if constexpr (sizeof(CharT) == 2) {
          return append_utf16(out, cp);
        } else {
          static_assert(sizeof(CharT) == 4, "append_codepoint: unsupported CharT size");
          if (is_surrogate(cp) || cp > max_codepoint) {
            return 0;
          }
          out += static_cast<CharT>(cp);
          return 1;
        }
      }

    } // namespace unicode
  } // namespace utility
} // namespace chaiscript

#endif