From 918d015302a004755ce0cf4968793cdf6a61bca8 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Mon, 25 Jan 2021 18:54:25 +0100 Subject: Add first working conversion UTF-8 -> UTF-16 --- include/unicode.h | 73 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 16 deletions(-) (limited to 'include/unicode.h') diff --git a/include/unicode.h b/include/unicode.h index b90ed15..512891a 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -4,8 +4,15 @@ #pragma once #include +#include #include +#ifdef __has_cpp_attribute +#if __has_cpp_attribute(__cpp_char8_t) +// char8_t available +#endif +#endif + namespace { struct utf8_iterator @@ -13,22 +20,57 @@ namespace { typedef char32_t value_type; typedef char32_t& reference; - void get_value() + utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): + iterator(cbegin), end_iterator(cend) { - // TODO: set value to current data in *iterator ... - value = 'X'; + calculate_value(); } - size_t get_number_of_utf8_bytes() + utf8_iterator(const utf8_iterator& other) = default; + utf8_iterator& operator=(const utf8_iterator& other) = default; + + // set value member + void calculate_value() { - // TODO: how many bytes - return 1; + if (iterator == end_iterator) + return; + + char8_t first_byte {*iterator}; + if (first_byte & 0x80) { // 2-4 bytes + if (iterator + 1 != end_iterator) { + char8_t second_byte {*(iterator + 1)}; + if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes + value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111); + sequence_length = 2; + } else if (iterator + 2 != end_iterator) { + char8_t third_byte {*(iterator + 2)}; + if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes + value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111); + sequence_length = 3; + } else if (iterator + 3 != end_iterator) { + char8_t fourth_byte {*(iterator + 3)}; + if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes + value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); + sequence_length = 4; + } else + throw std::invalid_argument("bad input: invalid 4 byte sequence"); + } else + throw std::invalid_argument("bad input: invalid 3 byte sequence"); + } else + throw std::invalid_argument("bad input: invalid 2 byte sequence"); + } else + throw std::invalid_argument("bad input: byte 2 expected, none found"); + } else { // 1 byte: 7 bit ASCII + value = first_byte; + sequence_length = 1; + } } // pre-increment utf8_iterator& operator++() { - iterator += get_number_of_utf8_bytes(); + iterator += sequence_length; + calculate_value(); return *this; } @@ -39,14 +81,14 @@ namespace { reference operator*() { - get_value(); return value; } std::u8string::const_iterator iterator; - std::u8string::const_iterator end_iterator; + value_type value{}; + size_t sequence_length{}; }; struct utf16_back_insert_iterator @@ -70,7 +112,12 @@ namespace { // append utf-16 word sequence reference operator=(const char32_t& value) { - s.push_back(0); // TODO + if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t! + s.push_back(value); + } else { + s.push_back((value >> 10) + 0xD800); + s.push_back((value & 0x3FF) + 0xDC00); + } return *this; } @@ -96,12 +143,6 @@ namespace { namespace unicode { -// returns number of bytes in UTF-8 byte sequence of first found code point, -// if found. 0 if none found or sequence empty. -//size_t utf8_start() -//{ -//} - std::u16string utf8_to_utf16(const std::u8string& s) { std::u16string result; -- cgit v1.2.3