diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 7bcbe737b6..7d6cb2fa79 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -116,20 +116,122 @@ std::string ReplaceAll(std::string result, const std::string& src, const std::st } std::string UTF16ToUTF8(std::u16string_view input) { - std::wstring_convert, char16_t> convert; - return convert.to_bytes(input.data(), input.data() + input.size()); + std::string result; + result.reserve(input.size() * 4); + for (size_t i = 0; i < input.size(); ++i) { + uint32_t code = input[i]; + // Handle surrogate pairs + if (code >= 0xD800 && code <= 0xDBFF) { + if (i + 1 < input.size()) { + uint32_t low = input[i + 1]; + if (low >= 0xDC00 && low <= 0xDFFF) { + code = ((code - 0xD800) << 10) + (low - 0xDC00) + 0x10000; + ++i; + } + } + } + if (code <= 0x7F) { + result.push_back(char(code)); + } else if (code <= 0x7FF) { + result.push_back(char(0xC0 | (code >> 6))); + result.push_back(char(0x80 | (code & 0x3F))); + } else if (code <= 0xFFFF) { + result.push_back(char(0xE0 | (code >> 12))); + result.push_back(char(0x80 | ((code >> 6) & 0x3F))); + result.push_back(char(0x80 | (code & 0x3F))); + } else { + result.push_back(char(0xF0 | (code >> 18))); + result.push_back(char(0x80 | ((code >> 12) & 0x3F))); + result.push_back(char(0x80 | ((code >> 6) & 0x3F))); + result.push_back(char(0x80 | (code & 0x3F))); + } + } + return result; } std::u16string UTF8ToUTF16(std::string_view input) { - std::wstring_convert, char16_t> convert; - return convert.from_bytes(input.data(), input.data() + input.size()); + std::u16string result; + result.reserve(input.size() * 2); + for (size_t i = 0; i < input.size(); ) { + uint32_t code = 0; + unsigned char c = input[i]; + size_t len = 0; + if ((c & 0x80) == 0) { + code = c; + len = 1; + } else if ((c & 0xE0) == 0xC0) { + code = c & 0x1F; + len = 2; + } else if ((c & 0xF0) == 0xE0) { + code = c & 0x0F; + len = 3; + } else if ((c & 0xF8) == 0xF0) { + code = c & 0x07; + len = 4; + } else { + ++i; + continue; + } + if (i + len - 1 >= input.size()) + break; + for (size_t j = 1; j <= len - 1; ++j) { + if ((input[i + j] & 0xC0) != 0x80) { + code = 0xFFFD; + break; + } + code = (code << 6) | (input[i + j] & 0x3F); + } + if (code <= 0xFFFF) { + result.push_back(char16_t(code)); + } else { + code -= 0x10000; + result.push_back(char16_t(0xD800 + (code >> 10))); + result.push_back(char16_t(0xDC00 + (code & 0x3FF))); + } + i += len; + } + return result; } std::u32string UTF8ToUTF32(std::string_view input) { - std::wstring_convert, char32_t> convert; - return convert.from_bytes(input.data(), input.data() + input.size()); + std::u32string result; + result.reserve(input.size()); + for (size_t i = 0; i < input.size(); ) { + uint32_t code = 0; + unsigned char c = input[i]; + size_t len = 0; + if ((c & 0x80) == 0) { + code = c; + len = 1; + } else if ((c & 0xE0) == 0xC0) { + code = c & 0x1F; + len = 2; + } else if ((c & 0xF0) == 0xE0) { + code = c & 0x0F; + len = 3; + } else if ((c & 0xF8) == 0xF0) { + code = c & 0x07; + len = 4; + } else { + ++i; + continue; + } + if (i + len - 1 >= input.size()) + break; + for (size_t j = 1; j <= len - 1; ++j) { + if ((input[i + j] & 0xC0) != 0x80) { + code = 0xFFFD; + break; + } + code = (code << 6) | (input[i + j] & 0x3F); + } + result.push_back(code); + i += len; + } + return result; } + #ifdef _WIN32 static std::wstring CPToUTF16(u32 code_page, std::string_view input) { const auto size =