From a2d82ec6f02ec9183cee3b006515b2510fbad36f Mon Sep 17 00:00:00 2001
From: mutouyun <orz@orzz.org>
Date: Tue, 7 Jan 2025 10:38:39 +0800
Subject: [PATCH] Add `codecvt`

---
 include/libipc/imp/codecvt.h      |  44 ++++
 src/libipc/imp/codecvt.cpp        | 388 ++++++++++++++++++++++++++++++
 src/libipc/platform/win/codecvt.h |  95 ++++++++
 test/imp/test_imp_codecvt.cpp     |  40 ++++
 4 files changed, 567 insertions(+)
 create mode 100644 include/libipc/imp/codecvt.h
 create mode 100644 src/libipc/imp/codecvt.cpp
 create mode 100644 src/libipc/platform/win/codecvt.h
 create mode 100644 test/imp/test_imp_codecvt.cpp

diff --git a/include/libipc/imp/codecvt.h b/include/libipc/imp/codecvt.h
new file mode 100644
index 0000000..1526f1e
--- /dev/null
+++ b/include/libipc/imp/codecvt.h
@@ -0,0 +1,44 @@
+/**
+ * \file libipc/imp/codecvt.h
+ * \author mutouyun (orz@orzz.org)
+ * \brief Character set conversion interface.
+ */
+#pragma once
+
+#include <string>
+#include <cstddef>
+
+#include "libipc/imp/export.h"
+
+namespace ipc {
+
+/**
+ * \brief The transform between UTF-8/16/32
+ *
+ * \param src  The source string pointer
+ * \param slen The source string length
+ * \param des  The target string pointer can be nullptr
+ * \param dlen The target string length can be 0
+ * \return The length of the converted string (the required length when
+ *         `des` is nullptr); 0 if the source is empty or cannot be converted
+ */
+template <typename CharT, typename CharU>
+IPC_EXPORT std::size_t cvt_cstr(CharT const *src, std::size_t slen, CharU *des, std::size_t dlen) noexcept;
+
+/**
+ * \brief Converts `src` into `des`, resizing `des` to the converted length.
+ *        `des` is cleared when the conversion yields nothing.
+ */
+template <typename CharT, typename TraitsT, typename AllocT,
+          typename CharU, typename TraitsU, typename AllocU>
+void cvt_sstr(std::basic_string<CharT, TraitsT, AllocT> const &src, std::basic_string<CharU, TraitsU, AllocU> &des) {
+  std::size_t dlen = cvt_cstr(src.c_str(), src.size(), (CharU *)nullptr, 0);
+  if (dlen == 0) {
+    des.clear();
+    return;
+  }
+  des.resize(dlen);
+  cvt_cstr(src.c_str(), src.size(), &des[0], des.size());
+}
+
+} // namespace ipc
diff --git a/src/libipc/imp/codecvt.cpp b/src/libipc/imp/codecvt.cpp
new file mode 100644
index 0000000..bd34daa
--- /dev/null
+++ b/src/libipc/imp/codecvt.cpp
@@ -0,0 +1,388 @@
+
+#include <algorithm>
+#include <type_traits>
+#include <cstring>
+#include <cstdint>
+
+#include "libipc/imp/codecvt.h"
+#include "libipc/imp/detect_plat.h"
+
+#if defined(LIBIPC_OS_WIN)
+# include "libipc/platform/win/codecvt.h"
+#endif
+
+namespace ipc {
+
+/**
+ * \brief The transform between local-character-set(UTF-8/GBK/...) and UTF-16/32.
+ *
+ * Modified from UnicodeConverter.
+ * Copyright (c) 2010. Jianhui Qin (http://blog.csdn.net/jhqin).
+ *
+ * \remarks codecvt_utf8_utf16/std::wstring_convert is deprecated.
+ * \see https://codingtidbit.com/2020/02/09/c17-codecvt_utf8-is-deprecated/
+ *      https://stackoverflow.com/questions/42946335/deprecated-header-codecvt-replacement
+ *      https://en.cppreference.com/w/cpp/locale/codecvt/in
+*/
+namespace {
+
+/// \brief X-bit unicode transformation format
+enum class ufmt {
+  utf8,
+  utf16,
+  utf32,
+};
+
+/// \brief Whether `T` can hold one code unit of the given format (matched by size).
+template <typename T, ufmt, typename = void>
+struct utf_compatible : std::false_type {};
+
+template <typename T>
+struct utf_compatible<T, ufmt::utf8,
+  std::enable_if_t<std::is_fundamental<T>::value && (sizeof(T) == 1)>> : std::true_type {};
+
+template <typename T>
+struct utf_compatible<T, ufmt::utf16,
+  std::enable_if_t<std::is_fundamental<T>::value && (sizeof(T) == 2)>> : std::true_type {};
+
+template <typename T>
+struct utf_compatible<T, ufmt::utf32,
+  std::enable_if_t<std::is_fundamental<T>::value && (sizeof(T) == 4)>> : std::true_type {};
+
+template <typename T, ufmt Fmt>
+constexpr bool utf_compatible_v = utf_compatible<T, Fmt>::value;
+
+/**
+ * \brief UTF-32 --> UTF-8
+ *
+ * \return The number of UTF-8 code units the code point needs, or 0 if `src`
+ *         is 0 or out of range. Nothing is written unless `des` is non-null
+ *         and `dlen` can hold the whole sequence.
+ */
+template <typename T, typename U>
+auto cvt_char(T src, U* des, std::size_t dlen) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf32> &&
+                      utf_compatible_v<U, ufmt::utf8>, std::size_t> {
+  if (src == 0) return 0;
+
+  constexpr std::uint8_t prefix[] = {
+    0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
+  };
+  constexpr std::uint32_t codeup[] = {
+    0x80,       // U+00000000 - U+0000007F
+    0x800,      // U+00000080 - U+000007FF
+    0x10000,    // U+00000800 - U+0000FFFF
+    0x200000,   // U+00010000 - U+001FFFFF
+    0x4000000,  // U+00200000 - U+03FFFFFF
+    0x80000000  // U+04000000 - U+7FFFFFFF
+  };
+
+  std::size_t i, len = sizeof(codeup) / sizeof(std::uint32_t);
+  for(i = 0; i < len; ++i) {
+    if (static_cast<std::uint32_t>(src) < codeup[i]) break;
+  }
+  if (i == len) return 0; // the src is invalid
+
+  len = i + 1;
+  // Write only when the whole sequence fits, so a short buffer is never
+  // overrun and never receives a truncated sequence.
+  if ((des != nullptr) && (dlen >= len)) {
+    for (; i > 0; --i) {
+      des[i] = static_cast<U>((src & 0x3F) | 0x80);
+      src >>= 6;
+    }
+    des[0] = static_cast<U>(src | prefix[len - 1]);
+  }
+  return len;
+}
+
+/**
+ * \brief UTF-8 --> UTF-32
+ *
+ * \return The number of UTF-8 code units consumed, or 0 if the input is
+ *         empty, truncated or not a valid lead/continuation sequence.
+ */
+template <typename T, typename U>
+auto cvt_char(T const *src, std::size_t slen, U &des) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf8> &&
+                      utf_compatible_v<U, ufmt::utf32>, std::size_t> {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) return 0;
+
+  std::uint8_t b = static_cast<std::uint8_t>(*(src++));
+
+  if (b < 0x80) {
+    des = b;
+    return 1;
+  }
+
+  if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid
+
+  std::size_t len;
+  if (b < 0xE0) {
+    des = b & 0x1F;
+    len = 2;
+  } else if (b < 0xF0) {
+    des = b & 0x0F;
+    len = 3;
+  } else if (b < 0xF8) {
+    des = b & 0x07;
+    len = 4;
+  } else if (b < 0xFC) {
+    des = b & 0x03;
+    len = 5;
+  } else {
+    des = b & 0x01;
+    len = 6;
+  }
+
+  if (slen < len) return 0;
+  std::size_t i = 1;
+  for(; i < len; ++i) {
+    b = static_cast<std::uint8_t>(*(src++));
+    if ((b < 0x80) || (b > 0xBF)) return 0; // the src is invalid
+    des = (des << 6) + (b & 0x3F);
+  }
+  return len;
+}
+
+/**
+ * \brief UTF-32 --> UTF-16
+ *
+ * \return The number of UTF-16 code units the code point needs (1 or 2),
+ *         or 0 if `src` is 0 or above U+10FFFF.
+ */
+template <typename T, typename U>
+auto cvt_char(T src, U *des, std::size_t dlen) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf32> &&
+                      utf_compatible_v<U, ufmt::utf16>, std::size_t> {
+  if (src == 0) return 0;
+
+  // Compare as unsigned so a negative signed 32-bit `T` is not taken for a BMP value.
+  auto const cp = static_cast<std::uint32_t>(src);
+  if (cp <= 0xFFFF) {
+    if ((des != nullptr) && (dlen != 0)) {
+      (*des) = static_cast<U>(cp);
+    }
+    return 1;
+  } else if (cp <= 0x10FFFF) {
+    if ((des != nullptr) && (dlen > 1)) {
+      des[0] = static_cast<U>(0xD800 + (cp >> 10) - 0x40);  // high
+      des[1] = static_cast<U>(0xDC00 + (cp & 0x03FF));      // low
+    }
+    return 2;
+  }
+  return 0;
+}
+
+/**
+ * \brief UTF-16 --> UTF-32
+ *
+ * \return The number of UTF-16 code units consumed (1 or 2), or 0 if the
+ *         input is empty or contains an unpaired surrogate.
+*/
+template <typename T, typename U>
+auto cvt_char(T const *src, std::size_t slen, U &des) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf16> &&
+                      utf_compatible_v<U, ufmt::utf32>, std::size_t> {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) return 0;
+
+  std::uint16_t w1 = src[0];
+  if ((w1 >= 0xD800) && (w1 <= 0xDFFF)) {
+    if (w1 < 0xDC00) {
+      if (slen < 2) return 0;
+      std::uint16_t w2 = src[1];
+      if ((w2 >= 0xDC00) && (w2 <= 0xDFFF)) {
+        des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);
+        return 2;
+      }
+    }
+    return 0; // the src is invalid
+  }
+  des = w1;
+  return 1;
+}
+
+/**
+ * \brief UTF-16 --> UTF-8
+*/
+template <typename T, typename U>
+auto cvt_char(T src, U *des, std::size_t dlen) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf16> &&
+                      utf_compatible_v<U, ufmt::utf8>, std::size_t> {
+  // make utf-16 to utf-32
+  std::uint32_t tmp;
+  if (cvt_char(&src, 1, tmp) != 1) return 0;
+  // make utf-32 to utf-8
+  return cvt_char(tmp, des, dlen);
+}
+
+/**
+ * \brief UTF-8 --> UTF-16
+*/
+template <typename T, typename U>
+auto cvt_char(T const *src, std::size_t slen, U &des) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf8> &&
+                      utf_compatible_v<U, ufmt::utf16>, std::size_t> {
+  // make utf-8 to utf-32
+  std::uint32_t tmp;
+  std::size_t len = cvt_char(src, slen, tmp);
+  if (len == 0) return 0;
+  // make utf-32 to utf-16
+  if (cvt_char(tmp, &des, 1) != 1) return 0;
+  return len;
+}
+
+/**
+ * \brief UTF-32 string --> UTF-8/16 string
+*/
+template <typename T, typename U>
+auto cvt_cstr_utf(T const *src, std::size_t slen, U *des, std::size_t dlen) noexcept
+  -> std::enable_if_t<utf_compatible_v<T, ufmt::utf32> &&
+                     (utf_compatible_v<U, ufmt::utf8> || utf_compatible_v<U, ufmt::utf16>), std::size_t> {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  std::size_t num = 0, len = 0;
+  for (std::size_t i = 0; (i < slen) && ((*src) != 0); ++src, ++i) {
+    len = cvt_char(*src, des, dlen);
+    if (len == 0) return 0;
+    if (des != nullptr) {
+      if (dlen < len) {
+        // out of room: stop writing, keep counting the required length
+        des  = nullptr;
+        dlen = 0;
+      } else {
+        des  += len;
+        dlen -= len;
+      }
+    }
+    num += len;
+  }
+  return num;
+}
+
+/**
+ * \brief UTF-8/16 string --> UTF-32 string
+*/
+template <typename T, typename U>
+auto cvt_cstr_utf(T const *src, std::size_t slen, U *des, std::size_t dlen) noexcept
+  -> std::enable_if_t<utf_compatible_v<U, ufmt::utf32> &&
+                     (utf_compatible_v<T, ufmt::utf8> || utf_compatible_v<T, ufmt::utf16>), std::size_t> {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  std::size_t num = 0;
+  for (std::size_t i = 0; (i < slen) && ((*src) != 0);) {
+    std::uint32_t tmp;
+    std::size_t len = cvt_char(src, slen - i, tmp);
+    if (len == 0) return 0;
+    if ((des != nullptr) && (dlen > 0)) {
+      (*des) = static_cast<U>(tmp);
+      ++des;
+      dlen -= 1;
+    }
+    src += len;
+    i   += len;
+    num += 1;
+  }
+  return num;
+}
+
+/**
+ * \brief UTF-8/16 string --> UTF-16/8 string
+*/
+template <typename T, typename U>
+auto cvt_cstr_utf(T const *src, std::size_t slen, U *des, std::size_t dlen) noexcept
+  -> std::enable_if_t<(utf_compatible_v<T, ufmt::utf8> && utf_compatible_v<U, ufmt::utf16>) ||
+                      (utf_compatible_v<T, ufmt::utf16> && utf_compatible_v<U, ufmt::utf8>), std::size_t> {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  std::size_t num = 0;
+  for (std::size_t i = 0; (i < slen) && ((*src) != 0);) {
+    // make utf-x to utf-32
+    std::uint32_t tmp;
+    std::size_t len = cvt_char(src, slen - i, tmp);
+    if (len == 0) return 0;
+    src += len;
+    i   += len;
+    // make utf-32 to utf-y
+    len = cvt_char(tmp, des, dlen);
+    if (len == 0) return 0;
+    if (des != nullptr) {
+      if (dlen < len) {
+        // out of room: stop writing, keep counting the required length
+        des  = nullptr;
+        dlen = 0;
+      } else {
+        des  += len;
+        dlen -= len;
+      }
+    }
+    num += len;
+  }
+  return num;
+}
+
+/**
+ * \brief Same-width code units: copies at most `dlen` units, or reports `slen` when there is no target.
+*/
+template <typename T, typename U>
+auto cvt_cstr_utf(T const *src, std::size_t slen, U *des, std::size_t dlen) noexcept
+  -> std::enable_if_t<(sizeof(T) == sizeof(U)), std::size_t> {
+  if ((des == nullptr) || (dlen == 0)) {
+    return slen;
+  }
+  std::size_t r = (std::min)(slen, dlen);
+  std::memcpy(des, src, r * sizeof(T));
+  return r;
+}
+
+} // namespace
+
+#define LIBIPC_DEF_CVT_CSTR_($CHAR_T, $CHAR_U) \
+  template <> \
+  std::size_t cvt_cstr($CHAR_T const *src, std::size_t slen, $CHAR_U *des, std::size_t dlen) noexcept { \
+    return cvt_cstr_utf(src, slen, des, dlen); \
+  }
+// #define LIBIPC_DEF_CVT_CSTR_($CHAR_T, $CHAR_U)
+
+LIBIPC_DEF_CVT_CSTR_(char    , char)
+LIBIPC_DEF_CVT_CSTR_(char    , char16_t)
+LIBIPC_DEF_CVT_CSTR_(char    , char32_t)
+LIBIPC_DEF_CVT_CSTR_(wchar_t , wchar_t)
+LIBIPC_DEF_CVT_CSTR_(char16_t, char16_t)
+LIBIPC_DEF_CVT_CSTR_(char16_t, char)
+LIBIPC_DEF_CVT_CSTR_(char16_t, char32_t)
+LIBIPC_DEF_CVT_CSTR_(char32_t, char32_t)
+LIBIPC_DEF_CVT_CSTR_(char32_t, char)
+LIBIPC_DEF_CVT_CSTR_(char32_t, char16_t)
+#if !defined(LIBIPC_OS_WIN)
+LIBIPC_DEF_CVT_CSTR_(char    , wchar_t)
+LIBIPC_DEF_CVT_CSTR_(wchar_t , char)
+LIBIPC_DEF_CVT_CSTR_(wchar_t , char16_t)
+LIBIPC_DEF_CVT_CSTR_(wchar_t , char32_t)
+LIBIPC_DEF_CVT_CSTR_(char16_t, wchar_t)
+LIBIPC_DEF_CVT_CSTR_(char32_t, wchar_t)
+#endif // !defined(LIBIPC_OS_WIN)
+
+#if defined(LIBIPC_CPP_20)
+LIBIPC_DEF_CVT_CSTR_(char8_t , char8_t)
+LIBIPC_DEF_CVT_CSTR_(char8_t , char)
+LIBIPC_DEF_CVT_CSTR_(char8_t , char16_t)
+LIBIPC_DEF_CVT_CSTR_(char8_t , char32_t)
+LIBIPC_DEF_CVT_CSTR_(char    , char8_t)
+LIBIPC_DEF_CVT_CSTR_(char16_t, char8_t)
+LIBIPC_DEF_CVT_CSTR_(char32_t, char8_t)
+#if !defined(LIBIPC_OS_WIN)
+LIBIPC_DEF_CVT_CSTR_(char8_t , wchar_t)
+LIBIPC_DEF_CVT_CSTR_(wchar_t , char8_t)
+#endif // !defined(LIBIPC_OS_WIN)
+#endif // defined(LIBIPC_CPP_20)
+
+#undef LIBIPC_DEF_CVT_CSTR_
+
+} // namespace ipc
diff --git a/src/libipc/platform/win/codecvt.h b/src/libipc/platform/win/codecvt.h
new file mode 100644
index 0000000..f617cc5
--- /dev/null
+++ b/src/libipc/platform/win/codecvt.h
@@ -0,0 +1,95 @@
+/**
+ * \file libipc/platform/win/codecvt.h
+ * \author mutouyun (orz@orzz.org)
+ */
+#pragma once
+
+#include <Windows.h>
+
+#include "libipc/imp/codecvt.h"
+#include "libipc/imp/detect_plat.h"
+
+namespace ipc {
+
+/**
+ * \see https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
+ *      https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
+ *
+ * CP_ACP       : The system default Windows ANSI code page.
+ * CP_MACCP     : The current system Macintosh code page.
+ * CP_OEMCP     : The current system OEM code page.
+ * CP_SYMBOL    : Symbol code page (42).
+ * CP_THREAD_ACP: The Windows ANSI code page for the current thread.
+ * CP_UTF7      : UTF-7. Use this value only when forced by a 7-bit transport mechanism. Use of UTF-8 is preferred.
+ * CP_UTF8      : UTF-8.
+*/
+
+template <>
+std::size_t cvt_cstr(char const *src, std::size_t slen, wchar_t *des, std::size_t dlen) noexcept {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  int cch_wc = (des == nullptr) ? 0 : (int)dlen;
+  int size_needed = ::MultiByteToWideChar(CP_ACP, 0, src, (int)slen, des, cch_wc);
+  if (size_needed <= 0) {
+    // failed: MultiByteToWideChar(CP_ACP).
+    return 0;
+  }
+  return static_cast<std::size_t>(size_needed);
+}
+
+template <>
+std::size_t cvt_cstr(wchar_t const *src, std::size_t slen, char *des, std::size_t dlen) noexcept {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  int cb_mb = (des == nullptr) ? 0 : (int)dlen;
+  int size_needed = ::WideCharToMultiByte(CP_ACP, 0, src, (int)slen, des, cb_mb, NULL, NULL);
+  if (size_needed <= 0) {
+    // failed: WideCharToMultiByte(CP_ACP).
+    return 0;
+  }
+  return static_cast<std::size_t>(size_needed);
+}
+
+/**
+ * \brief Used for char8_t (since C++20) to wchar_t conversion.
+ *
+ * There is no unit test to guarantee correctness (I'm a little lazy here),
+ * so if there are any bugs, please contact me in time.
+ */
+#if defined(LIBIPC_CPP_20)
+template <>
+std::size_t cvt_cstr(char8_t const *src, std::size_t slen, wchar_t *des, std::size_t dlen) noexcept {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  int cch_wc = (des == nullptr) ? 0 : (int)dlen;
+  int size_needed = ::MultiByteToWideChar(CP_UTF8, 0, (char *)src, (int)slen, des, cch_wc);
+  if (size_needed <= 0) {
+    // failed: MultiByteToWideChar(CP_UTF8).
+    return 0;
+  }
+  return static_cast<std::size_t>(size_needed);
+}
+
+template <>
+std::size_t cvt_cstr(wchar_t const *src, std::size_t slen, char8_t *des, std::size_t dlen) noexcept {
+  if ((src == nullptr) || (slen == 0) || ((*src) == 0)) {
+    // source string is empty
+    return 0;
+  }
+  int cb_mb = (des == nullptr) ? 0 : (int)dlen;
+  int size_needed = ::WideCharToMultiByte(CP_UTF8, 0, src, (int)slen, (char *)des, cb_mb, NULL, NULL);
+  if (size_needed <= 0) {
+    // failed: WideCharToMultiByte(CP_UTF8).
+    return 0;
+  }
+  return static_cast<std::size_t>(size_needed);
+}
+#endif // defined(LIBIPC_CPP_20)
+
+} // namespace ipc
diff --git a/test/imp/test_imp_codecvt.cpp b/test/imp/test_imp_codecvt.cpp
new file mode 100644
index 0000000..33b9cf7
--- /dev/null
+++ b/test/imp/test_imp_codecvt.cpp
@@ -0,0 +1,40 @@
+
+#include <string>
+#include <cstring>
+
+#include "test.h"
+
+#include "libipc/imp/codecvt.h"
+
+TEST(codecvt, cvt_cstr) {
+  char const utf8[] = "hello world, 你好,こんにちは";
+  wchar_t const utf16[] = L"hello world, 你好,こんにちは";
+  {
+    auto cvt_len = ipc::cvt_cstr(utf8, std::strlen(utf8), (wchar_t *)nullptr, 0);
+    EXPECT_NE(cvt_len, 0);
+    std::wstring wstr(cvt_len, L'\0');
+    EXPECT_EQ(ipc::cvt_cstr(utf8, std::strlen(utf8), &wstr[0], wstr.size()), cvt_len);
+    EXPECT_EQ(wstr, utf16);
+  }
+  {
+    auto cvt_len = ipc::cvt_cstr(utf16, std::wcslen(utf16), (char *)nullptr, 0);
+    EXPECT_NE(cvt_len, 0);
+    std::string str(cvt_len, '\0');
+    EXPECT_EQ(ipc::cvt_cstr(utf16, std::wcslen(utf16), &str[0], str.size()), cvt_len);
+    EXPECT_EQ(str, utf8);
+  }
+  {
+    auto cvt_len = ipc::cvt_cstr(utf8, std::strlen(utf8), (char *)nullptr, 0);
+    EXPECT_EQ(cvt_len, std::strlen(utf8));
+    std::string str(cvt_len, '\0');
+    EXPECT_EQ(ipc::cvt_cstr(utf8, cvt_len, &str[0], str.size()), cvt_len);
+    EXPECT_EQ(str, utf8);
+  }
+  {
+    auto cvt_len = ipc::cvt_cstr(utf16, std::wcslen(utf16), (wchar_t *)nullptr, 0);
+    EXPECT_EQ(cvt_len, std::wcslen(utf16));
+    std::wstring wstr(cvt_len, L'\0');
+    EXPECT_EQ(ipc::cvt_cstr(utf16, cvt_len, &wstr[0], wstr.size()), cvt_len);
+    EXPECT_EQ(wstr, utf16);
+  }
+}