#include <cstddef>
#include <cstdint>
#include <exception>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <vector>
/// Decodes a UTF-8 byte string into a sequence of UTF-32 code points.
///
/// @param str  Bytes to decode.
/// @return One char32_t per decoded code point, in input order.
/// @throws std::runtime_error ("Invalid UTF-8 sequence") when the input is
///         ill-formed: an invalid lead byte, a sequence cut off by the end of
///         the string, a trailing byte that is not a continuation byte, an
///         overlong encoding, an encoded surrogate (U+D800..U+DFFF), or a
///         value above U+10FFFF.
std::u32string utf8_to_utf32(const std::string& str) {
    std::u32string result;
    result.reserve(str.size());  // upper bound: at most one code point per byte

    const std::size_t size = str.size();
    std::size_t i = 0;
    while (i < size) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);

        char32_t codepoint = 0;     // payload bits accumulated so far
        std::size_t length = 0;     // total bytes in this sequence
        char32_t min_value = 0;     // smallest value legal for this length
        if ((lead & 0x80) == 0x00) {         // 0xxxxxxx: 1-byte (ASCII)
            length = 1;
            codepoint = lead;
        } else if ((lead & 0xE0) == 0xC0) {  // 110xxxxx: 2-byte
            length = 2;
            codepoint = lead & 0x1F;
            min_value = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {  // 1110xxxx: 3-byte
            length = 3;
            codepoint = lead & 0x0F;
            min_value = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {  // 11110xxx: 4-byte
            length = 4;
            codepoint = lead & 0x07;
            min_value = 0x10000;
        } else {
            // Stray continuation byte or a lead byte that UTF-8 never uses.
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        // The whole sequence must fit in the remaining input; reading past
        // the end of the string would be undefined behaviour.
        if (length > size - i) {
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        for (std::size_t k = 1; k < length; ++k) {
            const unsigned char trail = static_cast<unsigned char>(str[i + k]);
            if ((trail & 0xC0) != 0x80) {  // every trailing byte is 10xxxxxx
                throw std::runtime_error("Invalid UTF-8 sequence");
            }
            codepoint = (codepoint << 6) | (trail & 0x3F);
        }

        // Reject values that have a shorter encoding (overlong), surrogates,
        // and anything beyond the Unicode range.
        const bool overlong = codepoint < min_value;
        const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        const bool out_of_range = codepoint > 0x10FFFF;
        if (overlong || surrogate || out_of_range) {
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        result.push_back(codepoint);
        i += length;
    }
    return result;
}
/// Encodes UTF-32 code points as UTF-16 code units stored in a std::wstring.
///
/// Code points up to U+FFFF become a single code unit; code points above
/// that become a high/low surrogate pair. Each code unit occupies one
/// wchar_t regardless of how wide wchar_t is on the platform.
///
/// @param str  Code points to encode.
/// @return The UTF-16 code units, one per wchar_t element.
/// @throws std::runtime_error if a code point is a surrogate
///         (U+D800..U+DFFF) or lies above U+10FFFF; neither can be
///         represented in well-formed UTF-16.
std::wstring utf32_to_utf16(const std::u32string& str) {
    std::wstring result;
    result.reserve(str.size());  // exact for BMP-only text, a lower bound otherwise

    for (const char32_t codepoint : str) {
        const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        if (surrogate || codepoint > 0x10FFFF) {
            throw std::runtime_error("Invalid Unicode code point");
        }

        if (codepoint <= 0xFFFF) {  // BMP: one code unit
            result.push_back(static_cast<wchar_t>(codepoint));
            continue;
        }

        // Supplementary plane: split the 20-bit offset from U+10000 into
        // two 10-bit halves carried by a surrogate pair.
        const char32_t offset = codepoint - 0x10000;
        result.push_back(static_cast<wchar_t>(0xD800 + (offset >> 10)));
        result.push_back(static_cast<wchar_t>(0xDC00 + (offset & 0x3FF)));
    }
    return result;
}
/// Demo entry point: converts a UTF-8 literal to UTF-32, then to UTF-16, and
/// prints the result on the wide standard output stream.
///
/// @return 0 on success, 1 if a conversion reports an error.
int main() {
    // Wide streams convert wchar_t output through a locale. Under the default
    // "C" locale a non-ASCII character such as the en dash below typically
    // cannot be converted and leaves the stream in a failed state, so switch
    // to the user's environment locale first.
    try {
        std::locale::global(std::locale(""));
        std::wcout.imbue(std::locale());
    } catch (const std::runtime_error&) {
        // The environment names a locale this system does not provide;
        // carry on with the default locale rather than aborting.
    }

    try {
        const std::string utf8_text = "A – B";
        const std::u32string unicode_text = utf8_to_utf32(utf8_text);  // UTF-8 → UTF-32
        const std::wstring wide_text = utf32_to_utf16(unicode_text);   // UTF-32 → UTF-16
        std::wcout << L"UTF-16 wstring: " << wide_text << L'\n';
    } catch (const std::exception& error) {
        // Report conversion failures instead of letting them reach std::terminate.
        std::cerr << "Conversion failed: " << error.what() << '\n';
        return 1;
    }
    return 0;
}