diff options
| -rw-r--r-- | include/unicode.h | 210 | 
1 files changed, 132 insertions, 78 deletions
| diff --git a/include/unicode.h b/include/unicode.h index 395f172..4064233 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -47,6 +47,80 @@ namespace unicode::detail {   using namespace std::string_literals; + template<typename value_type> + inline bool is_utf8_followup_byte(value_type b) noexcept + { +  return (b & 0b11000000) == 0b10000000; + } + + template<size_t sequence_length, typename value_type> + inline bool is_utf8_leading_byte(value_type byte) noexcept + { +  static_assert(sequence_length <= 4); + +  if constexpr(sequence_length == 1) { +   return !(byte & 0x80); +  } else { +   return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length)); +  } + } + + template<typename value_type, typename... Tbytes> + inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept + { +  constexpr auto n{sizeof...(Tbytes) + 1}; + +  static_assert(n <= 4); + +  return is_utf8_leading_byte<n>(byte0) && +         (is_utf8_followup_byte(bytes) && ...); + } + + template<typename T> + inline bool validate_utf8(const std::basic_string<T>& s) + { +  static_assert(sizeof(T) == 1); + +  int i{}; +  auto size{s.size()}; +  while (i < size) { +   if (is_utf8_sequence(s[i])) { +    i++; +   } else if ((i < size - 1) && is_utf8_sequence(s[i], s[i + 1])) { +    i += 2; +   } else if ((i < size - 2) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) { +    if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20)) +     return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF +    i += 3; +   } else if ((i < size - 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) { +    if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11) +     return false; // Unicode too big above 0x10FFFF +    i += 4; +   } else { +    return false; +   } +  } +  return true; + } + + template<typename value_type> + inline char32_t continuation_value(value_type b) noexcept + { +  return static_cast<char32_t>(b & 0b00111111); + } + + template<typename value_type, typename... Targs> + inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept + { +  return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); + } + + template<size_t n, typename value_type> + inline char32_t value_byte0_of(value_type b) noexcept + { +  return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6); + } +   template<typename T, typename Container=std::basic_string<T>>   struct utf_iterator   { @@ -86,61 +160,28 @@ namespace unicode::detail {     }    } -  inline static bool is_continuation_byte(value_type b) noexcept -  { -   return (b & 0b11000000) == 0b10000000; -  } - -  template<typename... Targs> -  inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept -  { -   return is_continuation_byte(b) && is_continuation_byte(Fargs...); -  } - -  template<size_t n> -  inline static bool is_byte0_of(value_type b) noexcept -  { -   static_assert(n >= 2 && n <= 4); - -   return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n)); -  } - -  inline static internal_type continuation_value(value_type b) noexcept -  { -   return static_cast<internal_type>(b & 0b00111111); -  } - -  template<typename... Targs> -  inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept -  { -   return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); -  } - -  template<size_t n> -  inline static internal_type value_byte0_of(value_type b) noexcept -  { -   return static_cast<internal_type>(b & (0b1111111 >> n)) << ((n - 1) * 6); -  } -    template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>    inline internal_type calculate_value()    {     utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; -   if (byte0 & 0x80) { // 2-4 bytes +   if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII +    std::advance(iterator, 1); +    return byte0; +   } else {      internal_type value{};      if (size_t remaining{remaining_code_units()}; remaining >= 2) {       utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; -     if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes +     if (is_utf8_sequence(byte0, byte1)) { // 2 bytes        value = value_byte0_of<2>(byte0) | continuation_value(byte1);        std::advance(iterator, 2);       } else if (remaining >= 3) {        utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())}; -      if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes +      if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes         value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);         std::advance(iterator, 3);        } else if (remaining >= 4) {         utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())}; -       if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes +       if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes          value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);          std::advance(iterator, 4);         } else @@ -157,9 +198,6 @@ namespace unicode::detail {       throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));      return value; -   } else { // 1 byte: 7 bit ASCII -    std::advance(iterator, 1); -    return byte0;     }    } @@ -655,56 +693,72 @@ namespace unicode {   // From and To are facets   template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> - typename To::string_type convert(const typename From::string_type& s) + typename To::string_type convert_optimized(const typename From::string_type& s)   {    typename To::string_type result; - -  if constexpr(accu_size == 4 || accu_size == 8) { -   typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer; -   typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer; - -   auto begin{From::begin(s)}; -   auto end{From::end(s)}; -   auto back_inserter{To::back_inserter(result)}; -   auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])}; -   while (input_distance_bytes(begin, end) >= accu_size) { -    if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { -     while (input_distance_bytes(begin, end) >= accu_size) { -      typename arch_optimizer::accu_type data{*addr}; -      if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) { -       arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result); -       begin += accu_size / sizeof(typename From::value_type); -       ++addr; -      } else { -       // just advance one code unit for now and break to trigger unoptimized -       // version until next accu boundary -       back_inserter = *begin; -       ++begin; -       break; -      } +  typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer; +  typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer; + +  auto begin{From::begin(s)}; +  auto end{From::end(s)}; +  auto back_inserter{To::back_inserter(result)}; +  auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])}; +  while (input_distance_bytes(begin, end) >= accu_size) { +   if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { +    while (input_distance_bytes(begin, end) >= accu_size) { +     typename arch_optimizer::accu_type data{*addr}; +     if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) { +      arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result); +      begin += accu_size / sizeof(typename From::value_type); +      ++addr; +     } else { +      // just advance one code unit for now and break to trigger unoptimized +      // version until next accu boundary +      back_inserter = *begin; +      ++begin; +      break;       }      } - -    // keep up after unaligned Non-ASCII code points -    while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { -     back_inserter = *begin; -     ++begin; -    }     } -   // remainder < 8 bytes    -   while (begin != end) { +   // keep up after unaligned Non-ASCII code points +   while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) {      back_inserter = *begin;      ++begin;     } +  } -  } else { -   std::copy(From::begin(s), From::end(s), To::back_inserter(result)); +  // remainder < 8 bytes    +  while (begin != end) { +   back_inserter = *begin; +   ++begin;    }    return result;   } + // From and To are facets + template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s) + { +  if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) { +   if (validate_utf8<typename From::value_type>(s)) { +    if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>) +     return s; +    else +     return typename To::string_type{s.begin(), s.end()}; +   } else { +    throw std::invalid_argument("Invalid UTF-8"); +   } +  } if constexpr(accu_size == 4 || accu_size == 8) { +   return convert_optimized<From, To>(s); +  } else { +   typename To::string_type result; +   std::copy(From::begin(s), From::end(s), To::back_inserter(result)); +   return result; +  } + } +   // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet   template<typename T>   struct Encoding | 
