diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-12-28 12:46:30 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-12-28 12:46:30 +0100 |
commit | 403c885d67f79c637ebcb303722adfd6a4b8195e (patch) | |
tree | d8f40c674a5c65176e028a1c7bb9122baa2e7756 /include | |
parent | 970ba4111160fbf78351b21a024c46c0978e0440 (diff) |
Optimize UTF validation
Diffstat (limited to 'include')
-rw-r--r-- | include/unicode.h | 95 |
1 files changed, 65 insertions, 30 deletions
diff --git a/include/unicode.h b/include/unicode.h index 4064233..be91d77 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -47,12 +47,6 @@ namespace unicode::detail { using namespace std::string_literals; - template<typename value_type> - inline bool is_utf8_followup_byte(value_type b) noexcept - { - return (b & 0b11000000) == 0b10000000; - } - template<size_t sequence_length, typename value_type> inline bool is_utf8_leading_byte(value_type byte) noexcept { @@ -65,22 +59,26 @@ namespace unicode::detail { } } + template<typename value_type> + inline bool is_utf8_followup_byte(value_type b) noexcept + { + return (b & 0b11000000) == 0b10000000; + } + template<typename value_type, typename... Tbytes> inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept { constexpr auto n{sizeof...(Tbytes) + 1}; - static_assert(n <= 4); + static_assert(n <= 4, "UTF-8 sequences of 1 through 4 code units are supported"); return is_utf8_leading_byte<n>(byte0) && - (is_utf8_followup_byte(bytes) && ...); + (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right } - template<typename T> - inline bool validate_utf8(const std::basic_string<T>& s) + template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) { - static_assert(sizeof(T) == 1); - int i{}; auto size{s.size()}; while (i < size) { @@ -103,6 +101,48 @@ namespace unicode::detail { return true; } + template<typename value_type, typename... Twords> + inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept + { + constexpr auto n{sizeof...(Twords) + 1}; + + static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); + + if constexpr(n == 1) { + return is_valid_unicode(word0); + } else { + char16_t unit0 {static_cast<char16_t>(word0)}; + char16_t unit1 {static_cast<char16_t>((words, ...))}; + return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00; + } + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { + int i{}; + auto size{s.size()}; + while (i < size) { + if (is_utf16_sequence(s[i])) { + i++; + } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) { + i += 2; + } else { + return false; + } + } + return true; + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { + for (auto i: s) + if (!is_valid_unicode(i)) + return false; + return true; + } + template<typename value_type> inline char32_t continuation_value(value_type b) noexcept { @@ -160,7 +200,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> inline internal_type calculate_value() { utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; @@ -201,7 +241,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> inline internal_type calculate_value() { char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; @@ -222,7 +262,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> inline internal_type calculate_value() { internal_type result {static_cast<internal_type>(get_code_unit<0>())}; @@ -348,7 +388,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> inline void append_utf(const internal_type& value) { if (value < 0x80) { // 1 byte @@ -363,7 +403,7 @@ namespace unicode::detail { throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value))); } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> inline void append_utf(const internal_type& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) @@ -374,7 +414,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> inline void append_utf(const internal_type& value) { // expect value to be already valid Unicode values (checked in input iterator) @@ -741,12 +781,12 @@ namespace unicode { template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> typename To::string_type convert(const typename From::string_type& s) { - if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) { - if (validate_utf8<typename From::value_type>(s)) { - if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>) - return s; - else - return typename To::string_type{s.begin(), s.end()}; + // if input type == output type, only validate and return input, is appropriate + if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 && + std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> && + std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) { + if (validate_utf<typename From::value_type>(s)) { + return s; } else { throw std::invalid_argument("Invalid UTF-8"); } @@ -848,12 +888,7 @@ namespace unicode { template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true> bool is_valid_utf(const typename Facet::string_type& s) { - try { - std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); - } catch (const std::invalid_argument&) { - return false; - } - return true; + return validate_utf<typename Facet::value_type>(s); } } // namespace unicode |