#include #include #include #include #include struct utf8 { char mask; /* char data will be bitwise ANDed with this binary number */ char start; /* start index of the bytes of current char in a utf-8 encoded character */ uint32_t begin; /* beginning of codepoint range */ uint32_t end; /* end of codepoint range */ uint32_t bits_stored; /* the number of bits from the codepoint that fit in the char */ }; const std::vector utf8s = { /* mask start begin end bits */ utf8{ 0b00111111, static_cast(0b10000000), 0, 0, 6 }, utf8{ 0b01111111, static_cast(0b00000000), 0000, 0177, 7 }, utf8{ 0b00011111, static_cast(0b11000000), 0200, 03777, 5 }, utf8{ 0b00001111, static_cast(0b11100000), 04000, 0177777, 4 }, utf8{ 0b00000111, static_cast(0b11110000), 0200000, 04177777, 3 } }; uint32_t codepoint_size(const uint32_t& codepoint) { uint32_t size = 0; for ( const utf8& utf : utf8s ) { if ( ( codepoint >= utf.begin ) && ( codepoint <= utf.end ) ) { break; } size++; } return size; } uint32_t utf8_size(const char& ch) { uint32_t size = 0; for ( const utf8& utf : utf8s ) { if( ( ch & ~utf.mask ) == utf.start ) { break; } size++; } return size; } std::vector to_utf8(const uint32_t& codepoint) { const uint32_t byte_count = codepoint_size(codepoint); std::vector result{ }; uint32_t shift = utf8s[0].bits_stored * ( byte_count - 1 ); result.emplace_back(( codepoint >> shift & utf8s[byte_count].mask ) | utf8s[byte_count].start); shift -= utf8s[0].bits_stored; for ( uint32_t i = 1; i < byte_count; ++i ) { result.emplace_back(( codepoint >> shift & utf8s[0].mask ) | utf8s[0].start); shift -= utf8s[0].bits_stored; } return result; } uint32_t to_codepoint(const std::vector& chars) { const uint32_t byte_count = utf8_size(chars[0]); uint32_t shift = utf8s[0].bits_stored * ( byte_count - 1 ); uint32_t codepoint = ( chars[0] & utf8s[byte_count].mask ) << shift; for ( uint32_t index = 1; index < byte_count; ++index ) { shift -= utf8s[0].bits_stored; codepoint |= ( chars[index] & utf8s[0].mask ) << shift; } return codepoint; } int main() { const std::vector tests = { 0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e }; std::cout << "Character Unicode UTF-8 encoding (hex)" << "\n"; std::cout << "------------------------------------------" << "\n"; for ( const uint32_t& test : tests ) { std::wstring_convert, char32_t> convert; std::cout << convert.to_bytes(test) << "\t" << " "; const std::vector utf8 = to_utf8(test); const uint32_t codepoint = to_codepoint(utf8); std::cout << "U+" << std::setw(4) << std::setfill('0') << std::hex << codepoint << "\t"; for ( uint32_t i = 0; i < utf8.size(); ++i ) { std::cout << std::setw(2) << ( utf8[i] & 0xff ) << " "; } std::cout << "\n"; } }