summary refs log tree commit diff homepage
path: root/include/unicode.h
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2022-01-02 15:02:59 +0100
committer Roland Reichwein <mail@reichwein.it> 2022-01-02 15:02:59 +0100
commit ac045216d6e7fcb0ec4d2169ac2b6dffbe21707a (patch)
tree 1403ee320ad364ca9f0e15e8f64cf2a864ef0b4a /include/unicode.h
parent c969cddf87a2c6d2eb74353f3115a70d166136e5 (diff)
Remove dead code from optimizations
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- include/unicode.h 129
1 files changed, 25 insertions, 104 deletions
diff --git a/include/unicode.h b/include/unicode.h
index eb872ec..2bf17f4 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -214,7 +214,6 @@ namespace unicode {
template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true>
inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)
{
-#if 1
if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) {
result.append({
static_cast<To>(accu & 0x7F),
@@ -229,7 +228,6 @@ namespace unicode {
accu = 0;
bytes_in_accu = 0;
} else
-#endif
if ((accu & 0x80) == 0) { // 1 byte sequence
append_utf<7>(result, static_cast<char32_t>(accu & 0x7F));
accu >>= 8;
@@ -262,116 +260,39 @@ namespace unicode {
throw std::invalid_argument("Invalid UTF-8 byte sequence");
}
- // Little Endian optimized version for UTF-16
- // In block_mode, at least 4 bytes are in accu. On first call, even 8.
- // otherwise, at least one code unit is in accu
- template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_16_v<From>, bool> = true>
- inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)
- {
-#if 1
- if ((accu & 0xFF80FF80FF80FF80) == 0) {
- auto number_of_values{bytes_in_accu / sizeof(From)};
- result.resize(result.size() + number_of_values);
- for (int i = 0; i < number_of_values; i++) {
- result[result.size() - number_of_values + i] = static_cast<To>(accu & 0x7F);
- accu >>= 16;
- }
- bytes_in_accu = 0;
- } else
-#endif
- if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) {
- // found 4 code units forming 3 code points in UTF-16;
- // by definition of UTF-16, we have valid unicode values at this point
- if constexpr(is_utf_32_v<To>) {
- //result.resize(result.size() + 2);
- //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000;
- result.append({
- static_cast<To>(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000),
- static_cast<To>(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000)
- });
- } else {
- append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000);
- append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000);
- }
- accu = 0;
- bytes_in_accu = 0;
- } else
- if (From unit0 {static_cast<From>(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) {
- append_utf<16>(result, unit0);
- accu >>= 16;
- bytes_in_accu -= 2;
- } else
- if ((accu & 0xFC00FC00) == 0xDC00D800) {
- // found 2 code units forming 1 code point in UTF-16;
- // by definition of UTF-16, we have a valid unicode value at this point
- append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000);
- accu >>= 32;
- bytes_in_accu -= 4;
- } else
- throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
- }
-
// Little Endian optimized version
template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>
typename To::string_type convert_optimized_utf(const typename From::string_type& s)
{
typename To::string_type result;
- if constexpr(is_utf_32_v<typename From::value_type>) {
- for (const auto value: s) {
- if (is_valid_unicode(value))
- append_utf(result, value);
- else
- throw std::invalid_argument("Invalid Unicode character in UTF-32");
+ uint64_t accu{};
+ int bytes_in_accu{};
+
+ size_t s_index{};
+ size_t s_size{s.size()};
+ while (s_index + 8 / sizeof(typename From::value_type) <= s_size) {
+ // read input
+ // assume: bytes_in_accu < 8
+ accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
+ s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type);
+ bytes_in_accu = 8;
+
+ while (bytes_in_accu >= 4) {
+ append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu);
}
-#if 0
- } else if constexpr(is_utf_16_v<typename From::value_type>) {
- for (int i = 0; i < s.size(); i++) {
- typename From::value_type unit0{s[i]};
- if (is_valid_unicode(unit0)) {
- append_utf(result, unit0);
- } else {
- i++;
- if (i < s.size()) {
- typename From::value_type unit1 {s[i]};
- if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
- throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
-
- append_utf(result, (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000);
- } else
- throw std::invalid_argument("Invalid code unit at end of UTF-16 string");
- }
- }
-#endif
- } else {
- uint64_t accu{};
- int bytes_in_accu{};
-
- size_t s_index{};
- size_t s_size{s.size()};
- while (s_index + 8 / sizeof(typename From::value_type) <= s_size) {
- // read input
- // assume: bytes_in_accu < 8
- accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
- s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type);
- bytes_in_accu = 8;
-
- while (bytes_in_accu >= 4) {
- append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu);
- }
- }
-
- // 0..3 bytes left in accu
- // 0..7 bytes left in s
+ }
- while (s_index < s_size || bytes_in_accu > 0) {
- while (s_index < s_size && bytes_in_accu < 8) {
- accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
- ++s_index;
- bytes_in_accu += sizeof(typename From::value_type);
- }
+ // 0..3 bytes left in accu
+ // 0..7 bytes left in s
- append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu);
+ while (s_index < s_size || bytes_in_accu > 0) {
+ while (s_index < s_size && bytes_in_accu < 8) {
+ accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
+ ++s_index;
+ bytes_in_accu += sizeof(typename From::value_type);
}
+
+ append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu);
}
return result;
}
@@ -408,7 +329,7 @@ namespace unicode {
ToContainer convert(const FromContainer& s)
{
typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
-
+
ToContainer result;
std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));