diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-12-25 14:38:46 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-12-25 14:38:46 +0100 |
commit | 79dc9edc72c5b9fefb129fe36029d4781b1e969c (patch) | |
tree | 9e5ff95ef84ab089c652935ae8f94758318b6dbc /include/unicode.h | |
parent | 98f9132997353bb3e750e8e2db99ebd474a8dbb6 (diff) |
Generalized type usage and optimizations
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- | include/unicode.h | 155 |
1 files changed, 102 insertions, 53 deletions
diff --git a/include/unicode.h b/include/unicode.h index 8dedb19..c2d727a 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -45,8 +45,8 @@ namespace unicode::detail { { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); - typedef T input_type; - typedef char32_t value_type; + typedef T value_type; + typedef char32_t internal_type; typedef char32_t& reference; typedef char32_t* pointer; typedef size_t difference_type; @@ -67,9 +67,9 @@ namespace unicode::detail { } template<size_t index> - T get_code_unit() const noexcept + value_type get_code_unit() const noexcept { - if constexpr (std::is_same<Container, typename std::list<T>>::value) { + if constexpr (std::is_same<Container, typename std::list<value_type>>::value) { // std::list doesn't support it + n auto it{iterator}; std::advance(it, index); @@ -79,46 +79,46 @@ namespace unicode::detail { } } - inline static bool is_continuation_byte(T b) noexcept + inline static bool is_continuation_byte(value_type b) noexcept { return (b & 0b11000000) == 0b10000000; } template<typename... Targs> - inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept + inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept { return is_continuation_byte(b) && is_continuation_byte(Fargs...); } template<size_t n> - inline static bool is_byte0_of(T b) noexcept + inline static bool is_byte0_of(value_type b) noexcept { - return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n)); + return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n)); } - inline static char32_t continuation_value(T b) noexcept + inline static internal_type continuation_value(value_type b) noexcept { - return static_cast<char32_t>(b & 0b00111111); + return static_cast<internal_type>(b & 0b00111111); } template<typename... Targs> - inline static char32_t continuation_value(T b, Targs... Fargs) noexcept + inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept { return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); } template<size_t n> - inline static char32_t value_byte0_of(T b) noexcept + inline static internal_type value_byte0_of(value_type b) noexcept { return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6); } - template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> - inline value_type calculate_value() + template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> + inline internal_type calculate_value() { utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes - value_type value{}; + internal_type value{}; if (size_t remaining{remaining_code_units()}; remaining >= 2) { utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes @@ -154,8 +154,8 @@ namespace unicode::detail { } } - template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> - inline value_type calculate_value() + template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> + inline internal_type calculate_value() { char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; @@ -175,10 +175,10 @@ namespace unicode::detail { } } - template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> - inline value_type calculate_value() + template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> + inline internal_type calculate_value() { - value_type result {static_cast<char32_t>(get_code_unit<0>())}; + internal_type result {static_cast<internal_type>(get_code_unit<0>())}; if (!unicode::is_valid_unicode(result)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); @@ -199,7 +199,7 @@ namespace unicode::detail { return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } - value_type operator*() + internal_type operator*() { return calculate_value(); } @@ -256,14 +256,14 @@ namespace unicode::detail { // n is number of UTF-8 bytes in sequence template<size_t n> - inline static T byte0_of(char32_t value) + inline static value_type byte0_of(char32_t value) { return (value >> 6 * (n - 1)) | (0xFF << (8 - n)); } // n is index of 6-bit groups, counting from bit 0 template<size_t n> - inline static T trailing_byte(char32_t value) + inline static value_type trailing_byte(char32_t value) { return ((value >> n * 6) & 0b111111) | 0b10000000; } @@ -271,7 +271,7 @@ namespace unicode::detail { // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII) // assume value to be valid Unicode value for given byte position template<size_t n, size_t m> - inline static T byte_n_of_m(char32_t value) + inline static value_type byte_n_of_m(char32_t value) { if constexpr (n == 0) return byte0_of<m>(value); @@ -282,7 +282,7 @@ namespace unicode::detail { template<typename Arg> inline void append(Arg&& arg) { - if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) { + if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) { s.append({arg}); } else { s.emplace_back(arg); @@ -292,7 +292,7 @@ namespace unicode::detail { template<typename Arg, typename... Args> inline void append(Arg&& arg, Args&&... args) { - if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) { + if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) { s.append({arg, args...}); } else { s.emplace_back(arg); @@ -300,7 +300,7 @@ namespace unicode::detail { } } - template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> + template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> inline void append_utf(const char32_t& value) { if (value < 0x80) { // 1 byte @@ -315,18 +315,18 @@ namespace unicode::detail { throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value))); } - template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> + template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> inline void append_utf(const char32_t& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) append(static_cast<value_type>(value)); } else { char32_t value_reduced{value - 0x10000}; - append(static_cast<T>((value_reduced >> 10) + 0xD800), static_cast<T>((value_reduced & 0x3FF) + 0xDC00)); + append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00)); } } - template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> + template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> inline void append_utf(const char32_t& value) { // expect value to be already valid Unicode values (checked in input iterator) @@ -382,8 +382,8 @@ namespace unicode { template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>> struct iso_iterator { - typedef iso_t input_type; - typedef char32_t value_type; + typedef iso_t value_type; + typedef char32_t internal_type; typedef char32_t& reference; typedef char32_t* pointer; typedef size_t difference_type; @@ -406,9 +406,9 @@ namespace unicode { } // return reference? - value_type operator*() const + internal_type operator*() const { - input_type value{*m_it}; + value_type value{*m_it}; if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed { @@ -416,7 +416,7 @@ namespace unicode { if (it != Map.end()) return it->second; } - return static_cast<value_type>(static_cast<uint8_t>(value)); + return static_cast<internal_type>(static_cast<uint8_t>(value)); } iso_iterator& operator+=(size_t distance) @@ -554,28 +554,61 @@ namespace unicode { template<> struct ConvertInputOptimizer<1> { static const uint32_t ascii_mask { 0x80808080 }; + // 00112233 + // 00112222 + // 00111122 + // 00111111 + // 00001122 + // 00001111 + // 00000011 }; - template<int value_size> - struct ConvertOutputOptimizer {}; + template<> struct ConvertInputOptimizer<2> + { + static const uint32_t ascii_mask { 0xFF80FF80 }; + }; + + template<> struct ConvertInputOptimizer<4> + { + static const uint32_t ascii_mask { 0xFFFFFF80 }; + }; + + template<int AccuSize, class ConvertInputOptimizer> + struct ArchitectureOptimizer {}; - template<> struct ConvertOutputOptimizer<1> + template<class ConvertInputOptimizer> + struct ArchitectureOptimizer<4, ConvertInputOptimizer> { - template<typename input_value_type, class output_string_type, int code_units> + typedef ConvertInputOptimizer input_optimizer; + typedef uint32_t accu_type; + static const size_t accu_size {4}; + static const accu_type addr_mask {accu_size - 1}; + static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask }; + static const accu_type ascii_value { 0ULL }; + + template<typename input_value_type, class output_string_type> inline static void append(const input_value_type* addr, output_string_type& s) { - s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), code_units); + if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { + s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); + } else if constexpr(sizeof(input_value_type) == 1) { + s.append({static_cast<typename output_string_type::value_type>(addr[0]), + static_cast<typename output_string_type::value_type>(addr[1]), + static_cast<typename output_string_type::value_type>(addr[2]), + static_cast<typename output_string_type::value_type>(addr[3])}); + } else if constexpr(sizeof(input_value_type) == 2) { + s.append({static_cast<typename output_string_type::value_type>(addr[0]), + static_cast<typename output_string_type::value_type>(addr[1])}); + } else if constexpr(sizeof(input_value_type) == 4) { + s.append({static_cast<typename output_string_type::value_type>(addr[0])}); + } } }; - - template<int AccuSize, class ConvertInputOptimizer, class ConvertOutputOptimizer> - struct ArchitectureOptimizer {}; - template<class ConvertInputOptimizer, class ConvertOutputOptimizer> - struct ArchitectureOptimizer<8, ConvertInputOptimizer, ConvertOutputOptimizer> + template<class ConvertInputOptimizer> + struct ArchitectureOptimizer<8, ConvertInputOptimizer> { typedef ConvertInputOptimizer input_optimizer; - typedef ConvertOutputOptimizer output_optimizer; typedef uint64_t accu_type; static const size_t accu_size {8}; static const accu_type addr_mask {accu_size - 1}; @@ -585,7 +618,26 @@ namespace unicode { template<typename input_value_type, class output_string_type> inline static void append(const input_value_type* addr, output_string_type& s) { - output_optimizer::template append<input_value_type, output_string_type, accu_size>(addr, s); + if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { + s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); + } else if constexpr(sizeof(input_value_type) == 1) { + s.append({static_cast<typename output_string_type::value_type>(addr[0]), + static_cast<typename output_string_type::value_type>(addr[1]), + static_cast<typename output_string_type::value_type>(addr[2]), + static_cast<typename output_string_type::value_type>(addr[3]), + static_cast<typename output_string_type::value_type>(addr[4]), + static_cast<typename output_string_type::value_type>(addr[5]), + static_cast<typename output_string_type::value_type>(addr[6]), + static_cast<typename output_string_type::value_type>(addr[7])}); + } else if constexpr(sizeof(input_value_type) == 2) { + s.append({static_cast<typename output_string_type::value_type>(addr[0]), + static_cast<typename output_string_type::value_type>(addr[1]), + static_cast<typename output_string_type::value_type>(addr[2]), + static_cast<typename output_string_type::value_type>(addr[3])}); + } else if constexpr(sizeof(input_value_type) == 4) { + s.append({static_cast<typename output_string_type::value_type>(addr[0]), + static_cast<typename output_string_type::value_type>(addr[1])}); + } } }; @@ -595,12 +647,9 @@ namespace unicode { { typename To::string_type result; - if constexpr(sizeof(typename From::string_type::value_type) == 1 && - sizeof(typename To::value_type) == 1 && - sizeof(size_t) >= 8) { + if constexpr(sizeof(size_t) == 4 || sizeof(size_t) == 8) { typedef ConvertInputOptimizer<sizeof(typename From::string_type::value_type)> input_optimizer; - typedef ConvertOutputOptimizer<sizeof(typename To::value_type)> output_optimizer; - typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer, output_optimizer> arch_optimizer; + typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer> arch_optimizer; auto begin{From::begin(s)}; auto end{From::end(s)}; @@ -612,7 +661,7 @@ namespace unicode { typename arch_optimizer::accu_type data{*addr}; if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) { arch_optimizer::template append<typename From::string_type::value_type, typename To::string_type>(reinterpret_cast<const typename From::string_type::value_type*>(addr), result); - begin += arch_optimizer::accu_size; + begin += arch_optimizer::accu_size / sizeof(typename From::string_type::value_type); ++addr; } else { // just advance one code unit for now |