Browse Source

[common] use handrolled string utility conversions instead of wstring codecvt

Signed-off-by: lizzie <lizzie@eden-emu.dev>
lizzie/wstring-uni-123
lizzie 2 days ago
parent
commit
71e713483f
  1. 114
      src/common/string_util.cpp

114
src/common/string_util.cpp

@ -116,20 +116,122 @@ std::string ReplaceAll(std::string result, const std::string& src, const std::st
} }
// Converts UTF-16 text to UTF-8.
//
// A valid surrogate pair is combined into a single supplementary code point and written as a
// 4-byte sequence. An unpaired surrogate (a high surrogate that is not followed by a low one, or
// a low surrogate on its own) has no well-formed UTF-8 representation, so it is replaced with
// U+FFFD rather than being written out as an ill-formed 3-byte sequence.
std::string UTF16ToUTF8(std::u16string_view input) {
    constexpr uint32_t replacement_char = 0xFFFD;

    std::string result;
    // Worst case is 3 bytes per code unit: a BMP character needs at most 3 bytes and a
    // surrogate pair (2 code units) needs 4.
    result.reserve(input.size() * 3);

    for (size_t i = 0; i < input.size(); ++i) {
        uint32_t code = input[i];

        if (code >= 0xD800 && code <= 0xDBFF) {
            // High surrogate: only meaningful when immediately followed by a low surrogate.
            const bool has_low =
                i + 1 < input.size() && input[i + 1] >= 0xDC00 && input[i + 1] <= 0xDFFF;
            if (has_low) {
                const uint32_t low = input[i + 1];
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
                ++i; // The low surrogate has been consumed as part of the pair.
            } else {
                code = replacement_char;
            }
        } else if (code >= 0xDC00 && code <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            code = replacement_char;
        }

        if (code <= 0x7F) {
            result.push_back(static_cast<char>(code));
        } else if (code <= 0x7FF) {
            result.push_back(static_cast<char>(0xC0 | (code >> 6)));
            result.push_back(static_cast<char>(0x80 | (code & 0x3F)));
        } else if (code <= 0xFFFF) {
            result.push_back(static_cast<char>(0xE0 | (code >> 12)));
            result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F)));
            result.push_back(static_cast<char>(0x80 | (code & 0x3F)));
        } else {
            result.push_back(static_cast<char>(0xF0 | (code >> 18)));
            result.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3F)));
            result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F)));
            result.push_back(static_cast<char>(0x80 | (code & 0x3F)));
        }
    }
    return result;
}
// Converts UTF-8 text to UTF-16.
//
// Well-formed sequences are decoded to their code point; code points above U+FFFF are written
// as a surrogate pair. Every malformed sequence produces exactly one U+FFFD and decoding then
// resumes at the first byte that was not part of it, so valid text following an error is never
// swallowed. The following are treated as malformed:
//   - a byte that cannot start a sequence (stray continuation byte, 0xF8..0xFF),
//   - a sequence cut short by a non-continuation byte or by the end of the input,
//   - an overlong encoding (a code point stored in more bytes than necessary),
//   - an encoded surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
std::u16string UTF8ToUTF16(std::string_view input) {
    constexpr char16_t replacement_char = 0xFFFD;

    std::u16string result;
    // Each input byte yields at most one UTF-16 code unit.
    result.reserve(input.size());

    size_t i = 0;
    while (i < input.size()) {
        const auto lead = static_cast<unsigned char>(input[i]);
        uint32_t code = 0;
        uint32_t min_code = 0; // Smallest code point that legitimately needs `len` bytes.
        size_t len = 0;
        if ((lead & 0x80) == 0) {
            code = lead;
            len = 1;
        } else if ((lead & 0xE0) == 0xC0) {
            code = lead & 0x1F;
            min_code = 0x80;
            len = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            code = lead & 0x0F;
            min_code = 0x800;
            len = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            code = lead & 0x07;
            min_code = 0x10000;
            len = 4;
        } else {
            // Not a valid lead byte.
            result.push_back(replacement_char);
            ++i;
            continue;
        }

        // Gather continuation bytes, stopping at the first byte that is not one. That byte is
        // left unconsumed so it gets decoded on its own in the next iteration.
        size_t consumed = 1;
        while (consumed < len && i + consumed < input.size()) {
            const auto next = static_cast<unsigned char>(input[i + consumed]);
            if ((next & 0xC0) != 0x80) {
                break;
            }
            code = (code << 6) | (next & 0x3F);
            ++consumed;
        }
        i += consumed;

        const bool is_surrogate = code >= 0xD800 && code <= 0xDFFF;
        const bool well_formed =
            consumed == len && code >= min_code && code <= 0x10FFFF && !is_surrogate;
        if (!well_formed) {
            result.push_back(replacement_char);
        } else if (code <= 0xFFFF) {
            result.push_back(static_cast<char16_t>(code));
        } else {
            const uint32_t offset = code - 0x10000;
            result.push_back(static_cast<char16_t>(0xD800 + (offset >> 10)));
            result.push_back(static_cast<char16_t>(0xDC00 + (offset & 0x3FF)));
        }
    }
    return result;
}
// Converts UTF-8 text to UTF-32.
//
// Well-formed sequences are decoded to their code point. Every malformed sequence produces
// exactly one U+FFFD and decoding then resumes at the first byte that was not part of it, so
// valid text following an error is never swallowed. The following are treated as malformed:
//   - a byte that cannot start a sequence (stray continuation byte, 0xF8..0xFF),
//   - a sequence cut short by a non-continuation byte or by the end of the input,
//   - an overlong encoding (a code point stored in more bytes than necessary),
//   - an encoded surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
std::u32string UTF8ToUTF32(std::string_view input) {
    constexpr char32_t replacement_char = 0xFFFD;

    std::u32string result;
    // Each input byte yields at most one code point.
    result.reserve(input.size());

    size_t i = 0;
    while (i < input.size()) {
        const auto lead = static_cast<unsigned char>(input[i]);
        uint32_t code = 0;
        uint32_t min_code = 0; // Smallest code point that legitimately needs `len` bytes.
        size_t len = 0;
        if ((lead & 0x80) == 0) {
            code = lead;
            len = 1;
        } else if ((lead & 0xE0) == 0xC0) {
            code = lead & 0x1F;
            min_code = 0x80;
            len = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            code = lead & 0x0F;
            min_code = 0x800;
            len = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            code = lead & 0x07;
            min_code = 0x10000;
            len = 4;
        } else {
            // Not a valid lead byte.
            result.push_back(replacement_char);
            ++i;
            continue;
        }

        // Gather continuation bytes, stopping at the first byte that is not one. That byte is
        // left unconsumed so it gets decoded on its own in the next iteration.
        size_t consumed = 1;
        while (consumed < len && i + consumed < input.size()) {
            const auto next = static_cast<unsigned char>(input[i + consumed]);
            if ((next & 0xC0) != 0x80) {
                break;
            }
            code = (code << 6) | (next & 0x3F);
            ++consumed;
        }
        i += consumed;

        const bool is_surrogate = code >= 0xD800 && code <= 0xDFFF;
        const bool well_formed =
            consumed == len && code >= min_code && code <= 0x10FFFF && !is_surrogate;
        result.push_back(well_formed ? static_cast<char32_t>(code) : replacement_char);
    }
    return result;
}
#ifdef _WIN32 #ifdef _WIN32
static std::wstring CPToUTF16(u32 code_page, std::string_view input) { static std::wstring CPToUTF16(u32 code_page, std::string_view input) {
const auto size = const auto size =

Loading…
Cancel
Save