diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-12-28 16:10:33 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-12-28 16:10:33 +0100 |
commit | 2b27deb54fec75ed529776f30be8eeb4ea239257 (patch) | |
tree | 4415fee92e9def0fadb6679098ec6a07f125c1ba /include | |
parent | 563557be9c97496b7435bef4e64730a379e55037 (diff) |
Refactoring UTF-8 decoding, bugfixing
Diffstat (limited to 'include')
-rw-r--r-- | include/unicode.h | 51 |
1 files changed, 32 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h index 4e4c7eb..2d7bf71 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -1,6 +1,6 @@ // libunicode // -// Author: Roland Reichwein +// Author: Roland Reichwein <mail@reichwein.it> // // Available under the conditions of CC0 1.0 Universal // https://creativecommons.org/publicdomain/zero/1.0/ @@ -104,11 +104,11 @@ namespace unicode::detail { template<typename value_type, typename... Twords> inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept { - constexpr auto n{sizeof...(Twords) + 1}; + constexpr auto sequence_length{sizeof...(Twords) + 1}; - static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); + static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); - if constexpr(n == 1) { + if constexpr(sequence_length == 1) { return is_valid_unicode(word0); } else { char16_t unit0 {static_cast<char16_t>(word0)}; @@ -143,22 +143,35 @@ namespace unicode::detail { return true; } + template<size_t sequence_length, typename value_type> + inline char32_t decode_utf8_leading_byte(value_type b) noexcept + { + return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6); + } + template<typename value_type> - inline char32_t continuation_value(value_type b) noexcept + inline char32_t decode_utf8_followup_byte(value_type b) noexcept { return static_cast<char32_t>(b & 0b00111111); } template<typename value_type, typename... Targs> - inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept + inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept { - return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); + return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...); } - template<size_t n, typename value_type> - inline char32_t value_byte0_of(value_type b) noexcept + template<typename value_type, typename... Targs> + inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept { - return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6); + size_t constexpr sequence_length{sizeof...(Targs) + 1}; + + static_assert(sequence_length <= 4); + + if constexpr (sequence_length == 1) + return b; + else + return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...); } template<typename T, typename Container=std::basic_string<T>> @@ -206,32 +219,32 @@ namespace unicode::detail { utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII std::advance(iterator, 1); - return byte0; + return decode_utf8_sequence(byte0); } else { internal_type value{}; if (size_t remaining{remaining_code_units()}; remaining >= 2) { utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; if (is_utf8_sequence(byte0, byte1)) { // 2 bytes - value = value_byte0_of<2>(byte0) | continuation_value(byte1); + value = decode_utf8_sequence(byte0, byte1); std::advance(iterator, 2); } else if (remaining >= 3) { utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())}; if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes - value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); + value = decode_utf8_sequence(byte0, byte1, byte2); std::advance(iterator, 3); } else if (remaining >= 4) { utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())}; if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes - value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); + value = decode_utf8_sequence(byte0, byte1, byte2, byte3); std::advance(iterator, 4); } else throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); } else - throw std::invalid_argument("Bad input: Invalid 3 byte sequence"); + throw std::invalid_argument("Bad UTF-8 input: 4th byte expected, none found (end of sequence)"); } else - throw std::invalid_argument("Bad input: Invalid 2 byte sequence"); + throw std::invalid_argument("Bad UTF-8 input: 3rd byte expected, none found (end of sequence)"); } else - throw std::invalid_argument("Bad input: 2nd byte expected, none found"); + throw std::invalid_argument("Bad UTF-8 input: 2nd byte expected, none found (end of sequence)"); // check only for sequences >= 2 bytes (ASCII is always compliant) if (!unicode::is_valid_unicode(value)) @@ -782,13 +795,13 @@ namespace unicode { typename To::string_type convert(const typename From::string_type& s) { // if input type == output type, only validate and return input, is appropriate - if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 && + if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> && std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) { if (validate_utf<typename From::value_type>(s)) { return s; } else { - throw std::invalid_argument("Invalid UTF-8"); + throw std::invalid_argument("Invalid UTF input"); } } if constexpr(accu_size == 4 || accu_size == 8) { return convert_optimized<From, To>(s); |