diff options
| -rw-r--r-- | include/unicode.h | 129 | ||||
| -rw-r--r-- | include/unicode/type_traits.h | 6 | ||||
| -rw-r--r-- | include/unicode/utf.h | 5 | ||||
| -rw-r--r-- | include/unicode/utf_fwd.h | 7 | 
4 files changed, 35 insertions, 112 deletions
| diff --git a/include/unicode.h b/include/unicode.h index eb872ec..2bf17f4 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -214,7 +214,6 @@ namespace unicode {   template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true>   inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)   { -#if 1    if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) {     result.append({                   static_cast<To>(accu & 0x7F), @@ -229,7 +228,6 @@ namespace unicode {     accu = 0;     bytes_in_accu = 0;    } else -#endif    if ((accu & 0x80) == 0) { // 1 byte sequence     append_utf<7>(result, static_cast<char32_t>(accu & 0x7F));     accu >>= 8; @@ -262,116 +260,39 @@ namespace unicode {     throw std::invalid_argument("Invalid UTF-8 byte sequence");   } - // Little Endian optimized version for UTF-16 - // In block_mode, at least 4 bytes are in accu. On first call, even 8. - // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_16_v<From>, bool> = true> - inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) - { -#if 1 -  if ((accu & 0xFF80FF80FF80FF80) == 0) { -   auto number_of_values{bytes_in_accu / sizeof(From)}; -   result.resize(result.size() + number_of_values); -   for (int i = 0; i < number_of_values; i++) { -    result[result.size() - number_of_values + i] = static_cast<To>(accu & 0x7F); -    accu >>= 16; -   } -   bytes_in_accu = 0; -  } else -#endif -  if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) { -   // found 4 code units forming 3 code points in UTF-16; -   // by definition of UTF-16, we have valid unicode values at this point -   if constexpr(is_utf_32_v<To>) { -    //result.resize(result.size() + 2); -    //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000; -    result.append({ -                  static_cast<To>(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000), -                  static_cast<To>(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000) -                  }); -   } else { -    append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); -    append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000); -   } -   accu = 0; -   bytes_in_accu = 0; -  } else -  if (From unit0 {static_cast<From>(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) { -   append_utf<16>(result, unit0); -   accu >>= 16; -   bytes_in_accu -= 2; -  } else -  if ((accu & 0xFC00FC00) == 0xDC00D800) { -   // found 2 code units forming 1 code point in UTF-16; -   // by definition of UTF-16, we have a valid unicode value at this point -   append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); -   accu >>= 32; -   bytes_in_accu -= 4; -  } else -   throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - } -   // Little Endian optimized version   template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>   typename To::string_type convert_optimized_utf(const typename From::string_type& s)   {    typename To::string_type result; -  if constexpr(is_utf_32_v<typename From::value_type>) { -   for (const auto value: s) { -    if (is_valid_unicode(value)) -     append_utf(result, value); -    else -     throw std::invalid_argument("Invalid Unicode character in UTF-32"); +  uint64_t accu{}; +  int bytes_in_accu{}; + +  size_t s_index{}; +  size_t s_size{s.size()}; +  while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { +   // read input +   // assume: bytes_in_accu < 8 +   accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); +   s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); +   bytes_in_accu = 8; + +   while (bytes_in_accu >= 4) { +    append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu);     } -#if 0 -  } else if constexpr(is_utf_16_v<typename From::value_type>) { -   for (int i = 0; i < s.size(); i++) { -    typename From::value_type unit0{s[i]}; -    if (is_valid_unicode(unit0)) { -     append_utf(result, unit0); -    } else { -     i++; -     if (i < s.size()) { -      typename From::value_type unit1 {s[i]}; -      if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) -       throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - -      append_utf(result, (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000); -     } else -      throw std::invalid_argument("Invalid code unit at end of UTF-16 string"); -    } -   } -#endif -  } else { -   uint64_t accu{}; -   int bytes_in_accu{}; - -   size_t s_index{}; -   size_t s_size{s.size()}; -   while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { -    // read input -    // assume: bytes_in_accu < 8 -    accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); -    s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); -    bytes_in_accu = 8; - -    while (bytes_in_accu >= 4) { -     append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu); -    } -   } - -   // 0..3 bytes left in accu -   // 0..7 bytes left in s +  } -   while (s_index < s_size || bytes_in_accu > 0) { -    while (s_index < s_size && bytes_in_accu < 8) { -     accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); -     ++s_index; -     bytes_in_accu += sizeof(typename From::value_type); -    } +  // 0..3 bytes left in accu +  // 0..7 bytes left in s -    append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu); +  while (s_index < s_size || bytes_in_accu > 0) { +   while (s_index < s_size && bytes_in_accu < 8) { +    accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); +    ++s_index; +    bytes_in_accu += sizeof(typename From::value_type);     } + +   append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu);    }    return result;   } @@ -408,7 +329,7 @@ namespace unicode {   ToContainer convert(const FromContainer& s)   {    typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; -   +    ToContainer result;    std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index c3507e7..63c7d69 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -50,7 +50,7 @@ namespace unicode {   template<typename T>   struct is_utf_8   { -  static const bool value{std::is_trivial_v<T> && sizeof(T) == 1}; +  static const bool value{std::is_same_v<T, UTF_8> || (std::is_trivial_v<T> && sizeof(T) == 1)};   };   template<typename T> @@ -59,7 +59,7 @@ namespace unicode {   template<typename T>   struct is_utf_16   { -  static const bool value{std::is_trivial_v<T> && sizeof(T) == 2}; +  static const bool value{std::is_same_v<T, UTF_16> || (std::is_trivial_v<T> && sizeof(T) == 2)};   };   template<typename T> @@ -68,7 +68,7 @@ namespace unicode {   template<typename T>   struct is_utf_32   { -  static const bool value{std::is_trivial_v<T> && sizeof(T) == 4}; +  static const bool value{std::is_same_v<T, UTF_32> || (std::is_trivial_v<T> && sizeof(T) == 4)};   };   template<typename T> diff --git a/include/unicode/utf.h b/include/unicode/utf.h index 81e8f2b..046d9c6 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -415,11 +415,6 @@ namespace unicode {    }   }; - // Encoding for convert() - typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; - typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; - typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; -   // Helper to get correct Encoding from char type, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type>   template<typename T>   struct Encoding diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h index f3f6c52..c42dea1 100644 --- a/include/unicode/utf_fwd.h +++ b/include/unicode/utf_fwd.h @@ -2,6 +2,8 @@  // Forward declarations +#include "types.h" +  #include <string>  namespace unicode::detail { @@ -19,5 +21,10 @@ namespace unicode {   template<typename InputIt, typename OutputIt>   struct UTF; + // Encoding for convert() + typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; + typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; + typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; +  } // namespace unicode | 
