From 9dc97269201603dd479e15a736a64479a5095556 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 19 Dec 2021 19:51:38 +0100 Subject: Simplify utf_iterator for input, build on Debian 10+11, Ubuntu 2004-2204 --- include/unicode.h | 85 ++++++++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 48 deletions(-) (limited to 'include/unicode.h') diff --git a/include/unicode.h b/include/unicode.h index 6b6f21a..6d8aac5 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -28,7 +28,7 @@ namespace unicode { // usually, char32_t, uint32_t etc. template - static inline bool is_valid_unicode(const T& value) + static inline bool is_valid_unicode(const T& value) noexcept { return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF); } @@ -55,19 +55,18 @@ namespace unicode::detail { utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) { - calculate_value(); } utf_iterator(const utf_iterator& other) = default; utf_iterator& operator=(const utf_iterator& other) = default; - size_t remaining_code_units() const + size_t remaining_code_units() const noexcept { return std::distance(iterator, end_iterator); } template - T get_code_unit() const + T get_code_unit() const noexcept { if constexpr (std::is_same>::value) { // std::list doesn't support it + n @@ -79,46 +78,49 @@ namespace unicode::detail { } } - inline static bool is_continuation_byte(T b) + inline static bool is_continuation_byte(T b) noexcept { return (b & 0b11000000) == 0b10000000; } template - inline static bool is_continuation_byte(T b, Targs... Fargs) + inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept { return is_continuation_byte(b) && is_continuation_byte(Fargs...); } template - inline static bool is_byte0_of(T b) + inline static bool is_byte0_of(T b) noexcept { return (b & static_cast(0xFF << (7 - n))) == static_cast(0xFF << (8 - n)); } - inline static char32_t continuation_value(T b) + inline static char32_t continuation_value(T b) noexcept { return static_cast(b & 0b00111111); } template - inline static char32_t continuation_value(T b, Targs... Fargs) + inline static char32_t continuation_value(T b, Targs... Fargs) noexcept { return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); } template - inline static char32_t value_byte0_of(T b) + inline static char32_t value_byte0_of(T b) noexcept { return static_cast(b & (0b1111111 >> n)) << ((n - 1) * 6); } - void calculate_value_utf8() + template::type = true> + inline value_type calculate_value() { size_t remaining{remaining_code_units()}; if (!remaining) - return; + return {}; + + value_type value{}; utf8_t byte0 {static_cast(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes @@ -126,17 +128,17 @@ namespace unicode::detail { utf8_t byte1 {static_cast(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); - sequence_length = 2; + std::advance(iterator, 2); } else if (remaining >= 3) { utf8_t byte2 {static_cast(get_code_unit<2>())}; if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); - sequence_length = 3; + std::advance(iterator, 3); } else if (remaining >= 4) { utf8_t byte3 {static_cast(get_code_unit<3>())}; if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); - sequence_length = 4; + std::advance(iterator, 4); } else throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); } else @@ -152,22 +154,25 @@ namespace unicode::detail { } else { // 1 byte: 7 bit ASCII value = byte0; - sequence_length = 1; + std::advance(iterator, 1); } + + return value; } - void calculate_value_utf16() + template::type = true> + inline value_type calculate_value() { size_t remaining{remaining_code_units()}; if (!remaining) - return; + return {}; char16_t unit0 {static_cast(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) - value = unit0; - sequence_length = 1; + std::advance(iterator, 1); + return unit0; } else { if (remaining < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); @@ -176,45 +181,32 @@ namespace unicode::detail { if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - value = (static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; - sequence_length = 2; + std::advance(iterator, 2); + return (static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; } } - void calculate_value_utf32() + template::type = true> + inline value_type calculate_value() { size_t remaining{remaining_code_units()}; if (!remaining) - return; + return {}; - value = static_cast(get_code_unit<0>()); - - if (!unicode::is_valid_unicode(value)) - throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + value_type result {static_cast(get_code_unit<0>())}; - sequence_length = 1; - } + if (!unicode::is_valid_unicode(result)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(result))); - // set value member - void calculate_value() - { - static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + std::advance(iterator, 1); - if constexpr(sizeof(T) == 1) { - calculate_value_utf8(); - } else if constexpr (sizeof(T) == 2) { - calculate_value_utf16(); - } else if constexpr (sizeof(T) == 4) { - calculate_value_utf32(); - } + return result; } // pre-increment utf_iterator& operator++() { - std::advance(iterator, sequence_length); - calculate_value(); return *this; } @@ -223,17 +215,14 @@ namespace unicode::detail { return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } - reference operator*() + value_type operator*() { - return value; + return calculate_value(); } private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; - - char32_t value{}; // always save complete unicode code point at this point - size_t sequence_length{}; }; template> -- cgit v1.2.3