From 2b27deb54fec75ed529776f30be8eeb4ea239257 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Tue, 28 Dec 2021 16:10:33 +0100 Subject: Refactoring UTF-8 decoding, bugfixing --- include/unicode.h | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/unicode.h b/include/unicode.h index 4e4c7eb..2d7bf71 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -1,6 +1,6 @@ // libunicode // -// Author: Roland Reichwein +// Author: Roland Reichwein // // Available under the conditions of CC0 1.0 Universal // https://creativecommons.org/publicdomain/zero/1.0/ @@ -104,11 +104,11 @@ namespace unicode::detail { template inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept { - constexpr auto n{sizeof...(Twords) + 1}; + constexpr auto sequence_length{sizeof...(Twords) + 1}; - static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); + static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); - if constexpr(n == 1) { + if constexpr(sequence_length == 1) { return is_valid_unicode(word0); } else { char16_t unit0 {static_cast(word0)}; @@ -143,22 +143,35 @@ namespace unicode::detail { return true; } + template + inline char32_t decode_utf8_leading_byte(value_type b) noexcept + { + return static_cast(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6); + } + template - inline char32_t continuation_value(value_type b) noexcept + inline char32_t decode_utf8_followup_byte(value_type b) noexcept { return static_cast(b & 0b00111111); } template - inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept + inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept { - return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); + return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...); } - template - inline char32_t value_byte0_of(value_type b) noexcept + template + inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept { - return static_cast(b & (0b1111111 >> n)) << ((n - 1) * 6); + size_t constexpr sequence_length{sizeof...(Targs) + 1}; + + static_assert(sequence_length <= 4); + + if constexpr (sequence_length == 1) + return b; + else + return decode_utf8_leading_byte(b) | decode_utf8_followup_byte(bytes...); } template> @@ -206,32 +219,32 @@ namespace unicode::detail { utf8_t byte0 {static_cast(get_code_unit<0>())}; if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII std::advance(iterator, 1); - return byte0; + return decode_utf8_sequence(byte0); } else { internal_type value{}; if (size_t remaining{remaining_code_units()}; remaining >= 2) { utf8_t byte1 {static_cast(get_code_unit<1>())}; if (is_utf8_sequence(byte0, byte1)) { // 2 bytes - value = value_byte0_of<2>(byte0) | continuation_value(byte1); + value = decode_utf8_sequence(byte0, byte1); std::advance(iterator, 2); } else if (remaining >= 3) { utf8_t byte2 {static_cast(get_code_unit<2>())}; if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes - value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); + value = decode_utf8_sequence(byte0, byte1, byte2); std::advance(iterator, 3); } else if (remaining >= 4) { utf8_t byte3 {static_cast(get_code_unit<3>())}; if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes - value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); + value = decode_utf8_sequence(byte0, byte1, byte2, byte3); std::advance(iterator, 4); } else throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); } else - throw std::invalid_argument("Bad input: Invalid 3 byte sequence"); + throw std::invalid_argument("Bad UTF-8 input: 4th byte expected, none found (end of sequence)"); } else - throw std::invalid_argument("Bad input: Invalid 2 byte sequence"); + throw std::invalid_argument("Bad UTF-8 input: 3rd byte expected, none found (end of sequence)"); } else - throw std::invalid_argument("Bad input: 2nd byte expected, none found"); + throw std::invalid_argument("Bad UTF-8 input: 2nd byte expected, none found (end of sequence)"); // check only for sequences >= 2 bytes (ASCII is always compliant) if (!unicode::is_valid_unicode(value)) @@ -782,13 +795,13 @@ namespace unicode { typename To::string_type convert(const typename From::string_type& s) { // if input type == output type, only validate and return input, is appropriate - if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 && + if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && std::is_same_v, utf_back_insert_iterator>> && std::is_same_v, utf_back_insert_iterator>>) { if (validate_utf(s)) { return s; } else { - throw std::invalid_argument("Invalid UTF-8"); + throw std::invalid_argument("Invalid UTF input"); } } if constexpr(accu_size == 4 || accu_size == 8) { return convert_optimized(s); -- cgit v1.2.3