diff options
author | Roland Reichwein <mail@reichwein.it> | 2022-01-01 21:02:15 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2022-01-01 21:02:15 +0100 |
commit | c969cddf87a2c6d2eb74353f3115a70d166136e5 (patch) | |
tree | 2f1aa414cd37a41de064faf6e4121107648d66b2 | |
parent | 52d4375b10d920a59f1309c272a2e525feb1c25d (diff) |
Use own type traits
-rw-r--r-- | Makefile | 6 | ||||
-rw-r--r-- | include/unicode.h | 30 | ||||
-rw-r--r-- | include/unicode/type_traits.h | 2 | ||||
-rw-r--r-- | include/unicode/utf.h | 29 | ||||
-rw-r--r-- | include/unicode/utf_fwd.h | 23 |
5 files changed, 61 insertions, 29 deletions
@@ -139,7 +139,13 @@ DISTFILES= \ src/file.h \ Makefile \ include/unicode.h \ + include/unicode/endian.h \ + include/unicode/iso.h \ + include/unicode/predicate.h \ + include/unicode/types.h \ include/unicode/type_traits.h \ + include/unicode/utf.h \ + include/unicode/utf_fwd.h \ debian/control \ debian/compat \ debian/copyright \ diff --git a/include/unicode.h b/include/unicode.h index a50f525..eb872ec 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -77,15 +77,15 @@ namespace unicode { { if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); - } else if constexpr(sizeof(input_value_type) == 1) { + } else if constexpr(is_utf_8_v<input_value_type>) { s.append({static_cast<typename output_string_type::value_type>(addr[0]), static_cast<typename output_string_type::value_type>(addr[1]), static_cast<typename output_string_type::value_type>(addr[2]), static_cast<typename output_string_type::value_type>(addr[3])}); - } else if constexpr(sizeof(input_value_type) == 2) { + } else if constexpr(is_utf_16_v<input_value_type>) { s.append({static_cast<typename output_string_type::value_type>(addr[0]), static_cast<typename output_string_type::value_type>(addr[1])}); - } else if constexpr(sizeof(input_value_type) == 4) { + } else if constexpr(is_utf_32_v<input_value_type>) { s.append({static_cast<typename output_string_type::value_type>(addr[0])}); } } @@ -105,7 +105,7 @@ namespace unicode { { if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); - } else if constexpr(sizeof(input_value_type) == 1) { + } else if constexpr(is_utf_8_v<input_value_type>) { s.append({static_cast<typename output_string_type::value_type>(addr[0]), static_cast<typename output_string_type::value_type>(addr[1]), static_cast<typename output_string_type::value_type>(addr[2]), @@ -114,12 +114,12 @@ namespace unicode { static_cast<typename output_string_type::value_type>(addr[5]), static_cast<typename output_string_type::value_type>(addr[6]), static_cast<typename output_string_type::value_type>(addr[7])}); - } else if constexpr(sizeof(input_value_type) == 2) { + } else if constexpr(is_utf_16_v<input_value_type>) { s.append({static_cast<typename output_string_type::value_type>(addr[0]), static_cast<typename output_string_type::value_type>(addr[1]), static_cast<typename output_string_type::value_type>(addr[2]), static_cast<typename output_string_type::value_type>(addr[3])}); - } else if constexpr(sizeof(input_value_type) == 4) { + } else if constexpr(is_utf_32_v<input_value_type>) { s.append({static_cast<typename output_string_type::value_type>(addr[0]), static_cast<typename output_string_type::value_type>(addr[1])}); } @@ -174,7 +174,7 @@ namespace unicode { return result; } - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 1), bool> = true> + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_8_v<To>, bool> = true> inline void append_utf(std::basic_string<To>& result, const char32_t& value) { using From = char32_t; @@ -190,7 +190,7 @@ namespace unicode { } } - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 2), bool> = true> + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_16_v<To>, bool> = true> inline void append_utf(std::basic_string<To>& result, const char32_t& value) { if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values @@ -201,7 +201,7 @@ namespace unicode { } } - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 4), bool> = true> + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_32_v<To>, bool> = true> inline void append_utf(std::basic_string<To>& result, const char32_t& value) { // expect value to be already valid Unicode values (checked in input iterator) @@ -211,7 +211,7 @@ namespace unicode { // Little Endian optimized version for UTF-8 // In block_mode, at least 4 bytes are in accu. On first call, even 8. // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 1), bool> = true> + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true> inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) { #if 1 @@ -265,7 +265,7 @@ namespace unicode { // Little Endian optimized version for UTF-16 // In block_mode, at least 4 bytes are in accu. On first call, even 8. // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 2), bool> = true> + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_16_v<From>, bool> = true> inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) { #if 1 @@ -282,7 +282,7 @@ namespace unicode { if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) { // found 4 code units forming 3 code points in UTF-16; // by definition of UTF-16, we have valid unicode values at this point - if constexpr(sizeof(To) == 4) { + if constexpr(is_utf_32_v<To>) { //result.resize(result.size() + 2); //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000; result.append({ @@ -316,7 +316,7 @@ namespace unicode { typename To::string_type convert_optimized_utf(const typename From::string_type& s) { typename To::string_type result; - if constexpr(sizeof(typename From::value_type) == 4) { + if constexpr(is_utf_32_v<typename From::value_type>) { for (const auto value: s) { if (is_valid_unicode(value)) append_utf(result, value); @@ -324,7 +324,7 @@ namespace unicode { throw std::invalid_argument("Invalid Unicode character in UTF-32"); } #if 0 - } else if constexpr(sizeof(typename From::value_type) == 2) { + } else if constexpr(is_utf_16_v<typename From::value_type>) { for (int i = 0; i < s.size(); i++) { typename From::value_type unit0{s[i]}; if (is_valid_unicode(unit0)) { @@ -388,7 +388,7 @@ namespace unicode { } else { throw std::invalid_argument("Invalid UTF input"); } - } else if constexpr(accu_size == 8 && is_little_endian() && sizeof(typename From::value_type) == 1 && + } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v<typename From::value_type> && is_utf_encoding_v<From> && is_utf_encoding_v<To>) { // endian specific optimization return convert_optimized_utf<From, To>(s); } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index 3ee1d82..c3507e7 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -1,6 +1,6 @@ #pragma once -#include "utf.h" +#include "utf_fwd.h" #include <string> #include <type_traits> diff --git a/include/unicode/utf.h b/include/unicode/utf.h index dd504a7..81e8f2b 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -1,5 +1,8 @@ #pragma once +#include "utf_fwd.h" +#include "type_traits.h" + #include <list> #include <string> #include <stdexcept> @@ -37,7 +40,7 @@ namespace unicode::detail { (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right } - template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true> + template<typename T, typename std::enable_if_t<is_utf_8_v<T>, bool> = true> inline bool validate_utf(const std::basic_string<T>& s) { int i{}; @@ -78,7 +81,7 @@ namespace unicode::detail { } } - template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> + template<typename T, typename std::enable_if_t<is_utf_16_v<T>, bool> = true> inline bool validate_utf(const std::basic_string<T>& s) { int i{}; @@ -95,7 +98,7 @@ namespace unicode::detail { return true; } - template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> + template<typename T, typename std::enable_if_t<is_utf_32_v<T>, bool> = true> inline bool validate_utf(const std::basic_string<T>& s) { for (auto i: s) @@ -135,10 +138,10 @@ namespace unicode::detail { return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...); } - template<typename T, typename Container=std::basic_string<T>> + template<typename T, typename Container> struct utf_iterator { - static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + static_assert(is_utf_8_v<T> || is_utf_16_v<T> || is_utf_32_v<T>); typedef T value_type; typedef char32_t internal_type; @@ -199,13 +202,13 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> + template<class X = value_type, typename std::enable_if_t<is_utf_8_v<X>, bool> = true> inline internal_type calculate_value() { return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>())); } - template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> + template<class X = value_type, typename std::enable_if_t<is_utf_16_v<X>, bool> = true> inline internal_type calculate_value() { char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; @@ -226,7 +229,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> + template<class X = value_type, typename std::enable_if_t<is_utf_32_v<X>, bool> = true> inline internal_type calculate_value() { internal_type result {static_cast<internal_type>(get_code_unit<0>())}; @@ -296,10 +299,10 @@ namespace unicode::detail { return utf8_trailing_byte<m - n - 1, From, To>(value); } - template<typename T, typename Container=std::basic_string<T>> + template<typename T, typename Container> struct utf_back_insert_iterator { - static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + static_assert(is_utf_8_v<T> || is_utf_16_v<T> || is_utf_32_v<T>); typedef T value_type; typedef char32_t internal_type; @@ -341,7 +344,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> + template<class X = value_type, typename std::enable_if_t<is_utf_8_v<X>, bool> = true> inline void append_utf(const internal_type& value) { using Y = internal_type; @@ -357,7 +360,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> + template<class X = value_type, typename std::enable_if_t<is_utf_16_v<X>, bool> = true> inline void append_utf(const internal_type& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) @@ -368,7 +371,7 @@ namespace unicode::detail { } } - template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> + template<class X = value_type, typename std::enable_if_t<is_utf_32_v<X>, bool> = true> inline void append_utf(const internal_type& value) { // expect value to be already valid Unicode values (checked in input iterator) diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h new file mode 100644 index 0000000..f3f6c52 --- /dev/null +++ b/include/unicode/utf_fwd.h @@ -0,0 +1,23 @@ +#pragma once + +// Forward declarations + +#include <string> + +namespace unicode::detail { + + template<typename T, typename Container=std::basic_string<T>> + struct utf_iterator; + + template<typename T, typename Container=std::basic_string<T>> + struct utf_back_insert_iterator; + +} // namespace unicode::detail + +namespace unicode { + + template<typename InputIt, typename OutputIt> + struct UTF; + +} // namespace unicode + |