diff options
author | Roland Reichwein <mail@reichwein.it> | 2022-01-02 15:02:59 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2022-01-02 15:02:59 +0100 |
commit | ac045216d6e7fcb0ec4d2169ac2b6dffbe21707a (patch) | |
tree | 1403ee320ad364ca9f0e15e8f64cf2a864ef0b4a /include/unicode.h | |
parent | c969cddf87a2c6d2eb74353f3115a70d166136e5 (diff) |
Remove dead code from optimizations
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- | include/unicode.h | 129 |
1 files changed, 25 insertions, 104 deletions
diff --git a/include/unicode.h b/include/unicode.h index eb872ec..2bf17f4 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -214,7 +214,6 @@ namespace unicode { template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true> inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) { -#if 1 if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) { result.append({ static_cast<To>(accu & 0x7F), @@ -229,7 +228,6 @@ namespace unicode { accu = 0; bytes_in_accu = 0; } else -#endif if ((accu & 0x80) == 0) { // 1 byte sequence append_utf<7>(result, static_cast<char32_t>(accu & 0x7F)); accu >>= 8; @@ -262,116 +260,39 @@ namespace unicode { throw std::invalid_argument("Invalid UTF-8 byte sequence"); } - // Little Endian optimized version for UTF-16 - // In block_mode, at least 4 bytes are in accu. On first call, even 8. - // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_16_v<From>, bool> = true> - inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) - { -#if 1 - if ((accu & 0xFF80FF80FF80FF80) == 0) { - auto number_of_values{bytes_in_accu / sizeof(From)}; - result.resize(result.size() + number_of_values); - for (int i = 0; i < number_of_values; i++) { - result[result.size() - number_of_values + i] = static_cast<To>(accu & 0x7F); - accu >>= 16; - } - bytes_in_accu = 0; - } else -#endif - if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) { - // found 4 code units forming 3 code points in UTF-16; - // by definition of UTF-16, we have valid unicode values at this point - if constexpr(is_utf_32_v<To>) { - //result.resize(result.size() + 2); - //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000; - result.append({ - static_cast<To>(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000), - static_cast<To>(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000) - }); - } else { - append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); - append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000); - } - accu = 0; - bytes_in_accu = 0; - } else - if (From unit0 {static_cast<From>(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) { - append_utf<16>(result, unit0); - accu >>= 16; - bytes_in_accu -= 2; - } else - if ((accu & 0xFC00FC00) == 0xDC00D800) { - // found 2 code units forming 1 code point in UTF-16; - // by definition of UTF-16, we have a valid unicode value at this point - append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); - accu >>= 32; - bytes_in_accu -= 4; - } else - throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - } - // Little Endian optimized version template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> typename To::string_type convert_optimized_utf(const typename From::string_type& s) { typename To::string_type result; - if constexpr(is_utf_32_v<typename From::value_type>) { - for (const auto value: s) { - if (is_valid_unicode(value)) - append_utf(result, value); - else - throw std::invalid_argument("Invalid Unicode character in UTF-32"); + uint64_t accu{}; + int bytes_in_accu{}; + + size_t s_index{}; + size_t s_size{s.size()}; + while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { + // read input + // assume: bytes_in_accu < 8 + accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); + s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); + bytes_in_accu = 8; + + while (bytes_in_accu >= 4) { + append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu); } -#if 0 - } else if constexpr(is_utf_16_v<typename From::value_type>) { - for (int i = 0; i < s.size(); i++) { - typename From::value_type unit0{s[i]}; - if (is_valid_unicode(unit0)) { - append_utf(result, unit0); - } else { - i++; - if (i < s.size()) { - typename From::value_type unit1 {s[i]}; - if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) - throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - - append_utf(result, (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000); - } else - throw std::invalid_argument("Invalid code unit at end of UTF-16 string"); - } - } -#endif - } else { - uint64_t accu{}; - int bytes_in_accu{}; - - size_t s_index{}; - size_t s_size{s.size()}; - while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { - // read input - // assume: bytes_in_accu < 8 - accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); - s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); - bytes_in_accu = 8; - - while (bytes_in_accu >= 4) { - append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu); - } - } - - // 0..3 bytes left in accu - // 0..7 bytes left in s + } - while (s_index < s_size || bytes_in_accu > 0) { - while (s_index < s_size && bytes_in_accu < 8) { - accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); - ++s_index; - bytes_in_accu += sizeof(typename From::value_type); - } + // 0..3 bytes left in accu + // 0..7 bytes left in s - append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu); + while (s_index < s_size || bytes_in_accu > 0) { + while (s_index < s_size && bytes_in_accu < 8) { + accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); + ++s_index; + bytes_in_accu += sizeof(typename From::value_type); } + + append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu); } return result; } @@ -408,7 +329,7 @@ namespace unicode { ToContainer convert(const FromContainer& s) { typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; - + ToContainer result; std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); |