From fad8b697dff7c7b47f034124ea6eef25e74bd7af Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Tue, 26 Jan 2021 22:05:08 +0100 Subject: Implement conversion and first tests --- include/unicode.h | 257 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 211 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/unicode.h b/include/unicode.h index 512891a..a55eac3 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -15,66 +15,164 @@ namespace { - struct utf8_iterator + using namespace std::string_literals; + + template + struct utf_iterator { typedef char32_t value_type; typedef char32_t& reference; + typedef std::basic_string string_type; - utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): + utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) { - calculate_value(); + calculate_value(); + } + + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; + + size_t remaining_code_units() + { + return end_iterator - iterator; } - utf8_iterator(const utf8_iterator& other) = default; - utf8_iterator& operator=(const utf8_iterator& other) = default; + template + T get_code_unit() + { + return *(iterator + index); + } // set value member + // default: char32_t for UTF-32 + // specializations for UTF-8 and UTF-16 below + template void calculate_value() { - if (iterator == end_iterator) + size_t remaining{remaining_code_units()}; + + if (!remaining) + return; + + value = get_code_unit<0>(); + + if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + + sequence_length = 1; + } + + inline static bool is_continuation_byte(T b) + { + return (b & 0b11000000) == 0b10000000; + } + + template + inline static bool is_continuation_byte(T b, Targs... Fargs) + { + return is_continuation_byte(b) && is_continuation_byte(Fargs...); + } + + template + inline static bool is_byte0_of(T b) + { + return (b & static_cast(0xFF << (7 - n))) == static_cast(0xFF << (8 - n)); + } + + inline static char32_t continuation_value(T b) + { + return static_cast(b & 0b00111111); + } + + template + inline static char32_t continuation_value(T b, Targs... Fargs) + { + return continuation_value(b) << 6 | continuation_value(Fargs...); + } + + template + inline static char32_t value_byte0_of(T b) + { + return static_cast(b & (0b1111111 >> n)) << ((n - 1) * 6); + } + + // specialization for UTF-8 + template<> + void calculate_value() + { + size_t remaining{remaining_code_units()}; + + if (!remaining) return; - char8_t first_byte {*iterator}; - if (first_byte & 0x80) { // 2-4 bytes - if (iterator + 1 != end_iterator) { - char8_t second_byte {*(iterator + 1)}; - if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes - value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111); + char8_t byte0 {get_code_unit<0>()}; + if (byte0 & 0x80) { // 2-4 bytes + if (remaining >= 2) { + char8_t byte1 {get_code_unit<1>()}; + if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes + value = value_byte0_of<2>(byte0) | continuation_value(byte1); sequence_length = 2; - } else if (iterator + 2 != end_iterator) { - char8_t third_byte {*(iterator + 2)}; - if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes - value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111); + } else if (remaining >= 3) { + char8_t byte2 {get_code_unit<2>()}; + if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes + value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); sequence_length = 3; - } else if (iterator + 3 != end_iterator) { - char8_t fourth_byte {*(iterator + 3)}; - if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes - value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); + } else if (remaining >= 4) { + char8_t byte3 {get_code_unit<3>()}; + if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes + value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); sequence_length = 4; } else - throw std::invalid_argument("bad input: invalid 4 byte sequence"); + throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); } else - throw std::invalid_argument("bad input: invalid 3 byte sequence"); + throw std::invalid_argument("Bad input: Invalid 3 byte sequence"); } else - throw std::invalid_argument("bad input: invalid 2 byte sequence"); + throw std::invalid_argument("Bad input: Invalid 2 byte sequence"); } else - throw std::invalid_argument("bad input: byte 2 expected, none found"); + throw std::invalid_argument("Bad input: 2nd byte expected, none found"); } else { // 1 byte: 7 bit ASCII - value = first_byte; + value = byte0; sequence_length = 1; } } + // specialization for UTF-16 + template<> + void calculate_value() + { + size_t remaining{remaining_code_units()}; + + if (!remaining) + return; + + char16_t unit0 {get_code_unit<0>()}; + + if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) + value = unit0; + sequence_length = 1; + } else { + if (remaining < 2) + throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); + + char16_t unit1 {get_code_unit<1>()}; + if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) + throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); + + value = static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF); + sequence_length = 2; + } + } + // pre-increment - utf8_iterator& operator++() + utf_iterator& operator++() { iterator += sequence_length; - calculate_value(); + calculate_value(); return *this; } - bool operator!=(const utf8_iterator& other) const + bool operator!=(const utf_iterator& other) const { return iterator != other.iterator; } @@ -84,21 +182,23 @@ namespace { return value; } - std::u8string::const_iterator iterator; - std::u8string::const_iterator end_iterator; + typename string_type::const_iterator iterator; + typename string_type::const_iterator end_iterator; value_type value{}; size_t sequence_length{}; }; - struct utf16_back_insert_iterator + template + struct utf_back_insert_iterator { - typedef utf16_back_insert_iterator& reference; + typedef std::basic_string string_type; + typedef utf_back_insert_iterator& reference; - utf16_back_insert_iterator(std::u16string& s): s(s) {} + utf_back_insert_iterator(string_type& s): s(s) {} // no-op - utf16_back_insert_iterator& operator++() + utf_back_insert_iterator& operator++() { return *this; } @@ -109,10 +209,71 @@ namespace { return *this; } - // append utf-16 word sequence + // default: utf-32 code unit for UTF-32 + // specializations for UTF-8 and UTF-16 below + template reference operator=(const char32_t& value) { - if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t! + // expect value to be already valid Unicode values + s.push_back(value); + return *this; + } + + // n is number of UTF-8 bytes in sequence + template + inline static T byte0_of(char32_t value) + { + return (value >> 6 * (n - 1)) | (0xFF << (8 - n)); + } + + // n is index of 6-bit groups, counting from bit 0 + template + inline static T trailing_byte(char32_t value) + { + return ((value >> n * 6) & 0b111111) | 0b10000000; + } + + // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII) + // assume value to be valid Unicode value for given byte position + template + inline static T byte_n_of_m(char32_t value) + { + if constexpr (n == 0) + return byte0_of(value); + else + return trailing_byte(value); + } + + // specialization for UTF-8 + // append utf-8 byte sequence + template<> + reference operator=(const char32_t& value) + { + if (value < 0x80) { // 1 byte + s.push_back(value); + } else if (value < 0x800) { // 2 bytes + s.push_back(byte_n_of_m<0,2>(value)); + s.push_back(byte_n_of_m<1,2>(value)); + } else if (value < 0x10000) { // 3 bytes + s.push_back(byte_n_of_m<0,3>(value)); + s.push_back(byte_n_of_m<1,3>(value)); + s.push_back(byte_n_of_m<2,3>(value)); + } else if (value < 0x110000) { // 4 bytes + s.push_back(byte_n_of_m<0,4>(value)); + s.push_back(byte_n_of_m<1,4>(value)); + s.push_back(byte_n_of_m<2,4>(value)); + s.push_back(byte_n_of_m<3,4>(value)); + } else + throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast(value))); + return *this; + } + + // specialization for UTF-16 + // append utf-16 word sequence + template<> + reference operator=(const char32_t& value) + { + if (value <= 0xFFFF) { // expect value to be already valid Unicode values s.push_back(value); } else { s.push_back((value >> 10) + 0xD800); @@ -121,33 +282,37 @@ namespace { return *this; } - std::u16string& s; + typename utf_back_insert_iterator::string_type& s; }; - utf16_back_insert_iterator utf16_back_inserter(std::u16string& s) + template + utf_back_insert_iterator utf_back_inserter(std::basic_string& s) { - return utf16_back_insert_iterator(s); + return utf_back_insert_iterator(s); } - utf8_iterator utf8_begin(const std::u8string& s) + template + utf_iterator utf_begin(const std::basic_string& s) { - return utf8_iterator{s.cbegin(), s.cend()}; + return utf_iterator{s.cbegin(), s.cend()}; } - utf8_iterator utf8_end(const std::u8string& s) + template + utf_iterator utf_end(const std::basic_string& s) { - return utf8_iterator{s.cend(), s.cend()}; + return utf_iterator{s.cend(), s.cend()}; } } // namespace namespace unicode { -std::u16string utf8_to_utf16(const std::u8string& s) +template +std::basic_string utf_to_utf(const std::basic_string& s) { - std::u16string result; + std::basic_string result; - std::copy(utf8_begin(s), utf8_end(s), utf16_back_inserter(result)); + std::copy(utf_begin(s), utf_end(s), utf_back_inserter(result)); return result; } -- cgit v1.2.3