diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-12-27 21:51:05 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-12-27 21:51:05 +0100 |
commit | 970ba4111160fbf78351b21a024c46c0978e0440 (patch) | |
tree | 300e5a70adde02999845aa05e1727b3510fd62aa /include | |
parent | d992304dc12f078f12eb971c6283e0b54054e6b1 (diff) |
Optimize UTF-8 validation
Diffstat (limited to 'include')
-rw-r--r-- | include/unicode.h | 210 |
1 files changed, 132 insertions, 78 deletions
diff --git a/include/unicode.h b/include/unicode.h index 395f172..4064233 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -47,6 +47,80 @@ namespace unicode::detail { using namespace std::string_literals; + template<typename value_type> + inline bool is_utf8_followup_byte(value_type b) noexcept + { + return (b & 0b11000000) == 0b10000000; + } + + template<size_t sequence_length, typename value_type> + inline bool is_utf8_leading_byte(value_type byte) noexcept + { + static_assert(sequence_length <= 4); + + if constexpr(sequence_length == 1) { + return !(byte & 0x80); + } else { + return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length)); + } + } + + template<typename value_type, typename... Tbytes> + inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept + { + constexpr auto n{sizeof...(Tbytes) + 1}; + + static_assert(n <= 4); + + return is_utf8_leading_byte<n>(byte0) && + (is_utf8_followup_byte(bytes) && ...); + } + + template<typename T> + inline bool validate_utf8(const std::basic_string<T>& s) + { + static_assert(sizeof(T) == 1); + + int i{}; + auto size{s.size()}; + while (i < size) { + if (is_utf8_sequence(s[i])) { + i++; + } else if ((i < size - 1) && is_utf8_sequence(s[i], s[i + 1])) { + i += 2; + } else if ((i < size - 2) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) { + if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20)) + return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF + i += 3; + } else if ((i < size - 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) { + if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11) + return false; // Unicode too big above 0x10FFFF + i += 4; + } else { + return false; + } + } + return true; + } + + template<typename value_type> + inline char32_t continuation_value(value_type b) noexcept + { + return static_cast<char32_t>(b & 0b00111111); + } + + template<typename value_type, typename... Targs> + inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept + { + return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); + } + + template<size_t n, typename value_type> + inline char32_t value_byte0_of(value_type b) noexcept + { + return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6); + } + template<typename T, typename Container=std::basic_string<T>> struct utf_iterator { @@ -86,61 +160,28 @@ namespace unicode::detail { } } - inline static bool is_continuation_byte(value_type b) noexcept - { - return (b & 0b11000000) == 0b10000000; - } - - template<typename... Targs> - inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept - { - return is_continuation_byte(b) && is_continuation_byte(Fargs...); - } - - template<size_t n> - inline static bool is_byte0_of(value_type b) noexcept - { - static_assert(n >= 2 && n <= 4); - - return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n)); - } - - inline static internal_type continuation_value(value_type b) noexcept - { - return static_cast<internal_type>(b & 0b00111111); - } - - template<typename... Targs> - inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept - { - return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); - } - - template<size_t n> - inline static internal_type value_byte0_of(value_type b) noexcept - { - return static_cast<internal_type>(b & (0b1111111 >> n)) << ((n - 1) * 6); - } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> inline internal_type calculate_value() { utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; - if (byte0 & 0x80) { // 2-4 bytes + if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII + std::advance(iterator, 1); + return byte0; + } else { internal_type value{}; if (size_t remaining{remaining_code_units()}; remaining >= 2) { utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; - if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes + if (is_utf8_sequence(byte0, byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); std::advance(iterator, 2); } else if (remaining >= 3) { utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())}; - if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes + if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); std::advance(iterator, 3); } else if (remaining >= 4) { utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())}; - if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes + if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); std::advance(iterator, 4); } else @@ -157,9 +198,6 @@ namespace unicode::detail { throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value))); return value; - } else { // 1 byte: 7 bit ASCII - std::advance(iterator, 1); - return byte0; } } @@ -655,56 +693,72 @@ namespace unicode { // From and To are facets template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> - typename To::string_type convert(const typename From::string_type& s) + typename To::string_type convert_optimized(const typename From::string_type& s) { typename To::string_type result; - - if constexpr(accu_size == 4 || accu_size == 8) { - typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer; - typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer; - - auto begin{From::begin(s)}; - auto end{From::end(s)}; - auto back_inserter{To::back_inserter(result)}; - auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])}; - while (input_distance_bytes(begin, end) >= accu_size) { - if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { - while (input_distance_bytes(begin, end) >= accu_size) { - typename arch_optimizer::accu_type data{*addr}; - if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) { - arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result); - begin += accu_size / sizeof(typename From::value_type); - ++addr; - } else { - // just advance one code unit for now and break to trigger unoptimized - // version until next accu boundary - back_inserter = *begin; - ++begin; - break; - } + typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer; + typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer; + + auto begin{From::begin(s)}; + auto end{From::end(s)}; + auto back_inserter{To::back_inserter(result)}; + auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])}; + while (input_distance_bytes(begin, end) >= accu_size) { + if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { + while (input_distance_bytes(begin, end) >= accu_size) { + typename arch_optimizer::accu_type data{*addr}; + if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) { + arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result); + begin += accu_size / sizeof(typename From::value_type); + ++addr; + } else { + // just advance one code unit for now and break to trigger unoptimized + // version until next accu boundary + back_inserter = *begin; + ++begin; + break; } } - - // keep up after unaligned Non-ASCII code points - while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { - back_inserter = *begin; - ++begin; - } } - // remainder < 8 bytes - while (begin != end) { + // keep up after unaligned Non-ASCII code points + while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { back_inserter = *begin; ++begin; } + } - } else { - std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + // remainder < 8 bytes + while (begin != end) { + back_inserter = *begin; + ++begin; } return result; } + // From and To are facets + template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s) + { + if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) { + if (validate_utf8<typename From::value_type>(s)) { + if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>) + return s; + else + return typename To::string_type{s.begin(), s.end()}; + } else { + throw std::invalid_argument("Invalid UTF-8"); + } + } if constexpr(accu_size == 4 || accu_size == 8) { + return convert_optimized<From, To>(s); + } else { + typename To::string_type result; + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + return result; + } + } + // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet template<typename T> struct Encoding |