diff options
| author | Roland Reichwein <mail@reichwein.it> | 2021-12-25 14:38:46 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2021-12-25 14:38:46 +0100 | 
| commit | 79dc9edc72c5b9fefb129fe36029d4781b1e969c (patch) | |
| tree | 9e5ff95ef84ab089c652935ae8f94758318b6dbc /include | |
| parent | 98f9132997353bb3e750e8e2db99ebd474a8dbb6 (diff) | |
Generalized type usage and optimizations
Diffstat (limited to 'include')
| -rw-r--r-- | include/unicode.h | 155 | 
1 file changed, 102 insertions, 53 deletions
| diff --git a/include/unicode.h b/include/unicode.h index 8dedb19..c2d727a 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -45,8 +45,8 @@ namespace unicode::detail {   {    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); -  typedef T input_type; -  typedef char32_t value_type; +  typedef T value_type; +  typedef char32_t internal_type;    typedef char32_t& reference;    typedef char32_t* pointer;    typedef size_t difference_type; @@ -67,9 +67,9 @@ namespace unicode::detail {    }    template<size_t index> -  T get_code_unit() const noexcept +  value_type get_code_unit() const noexcept    { -   if constexpr (std::is_same<Container, typename std::list<T>>::value) { +   if constexpr (std::is_same<Container, typename std::list<value_type>>::value) {      // std::list doesn't support it + n      auto it{iterator};      std::advance(it, index); @@ -79,46 +79,46 @@ namespace unicode::detail {     }    } -  inline static bool is_continuation_byte(T b) noexcept +  inline static bool is_continuation_byte(value_type b) noexcept    {     return (b & 0b11000000) == 0b10000000;    }    template<typename... Targs> -  inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept +  inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept    {     return is_continuation_byte(b) && is_continuation_byte(Fargs...);    }    template<size_t n> -  inline static bool is_byte0_of(T b) noexcept +  inline static bool is_byte0_of(value_type b) noexcept    { -   return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n)); +   return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n));    } -  inline static char32_t continuation_value(T b) noexcept +  inline static internal_type continuation_value(value_type b) noexcept    { -   return static_cast<char32_t>(b & 0b00111111); +   return static_cast<internal_type>(b & 0b00111111);    }    template<typename... 
Targs> -  inline static char32_t continuation_value(T b, Targs... Fargs) noexcept +  inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept    {     return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);    }    template<size_t n> -  inline static char32_t value_byte0_of(T b) noexcept +  inline static internal_type value_byte0_of(value_type b) noexcept    {     return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);    } -  template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> -  inline value_type calculate_value() +  template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> +  inline internal_type calculate_value()    {     utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};     if (byte0 & 0x80) { // 2-4 bytes -    value_type value{}; +    internal_type value{};      if (size_t remaining{remaining_code_units()}; remaining >= 2) {       utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};       if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes @@ -154,8 +154,8 @@ namespace unicode::detail {     }    } -  template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> -  inline value_type calculate_value() +  template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> +  inline internal_type calculate_value()    {     char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; @@ -175,10 +175,10 @@ namespace unicode::detail {     }    } -  template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> -  inline value_type calculate_value() +  template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> +  inline internal_type calculate_value()    { -   value_type result {static_cast<char32_t>(get_code_unit<0>())}; +   internal_type result {static_cast<internal_type>(get_code_unit<0>())};     if 
(!unicode::is_valid_unicode(result))      throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); @@ -199,7 +199,7 @@ namespace unicode::detail {     return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);    } -  value_type operator*() +  internal_type operator*()    {     return calculate_value();    } @@ -256,14 +256,14 @@ namespace unicode::detail {    // n is number of UTF-8 bytes in sequence    template<size_t n> -  inline static T byte0_of(char32_t value) +  inline static value_type byte0_of(char32_t value)    {     return (value >> 6 * (n - 1)) | (0xFF << (8 - n));    }    // n is index of 6-bit groups, counting from bit 0    template<size_t n> -  inline static T trailing_byte(char32_t value) +  inline static value_type trailing_byte(char32_t value)    {     return ((value >> n * 6) & 0b111111) | 0b10000000;    } @@ -271,7 +271,7 @@ namespace unicode::detail {    // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)    // assume value to be valid Unicode value for given byte position    template<size_t n, size_t m> -  inline static T byte_n_of_m(char32_t value) +  inline static value_type byte_n_of_m(char32_t value)    {     if constexpr (n == 0)      return byte0_of<m>(value); @@ -282,7 +282,7 @@ namespace unicode::detail {    template<typename Arg>    inline void append(Arg&& arg)    { -   if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) { +   if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {      s.append({arg});     } else {      s.emplace_back(arg); @@ -292,7 +292,7 @@ namespace unicode::detail {    template<typename Arg, typename... Args>    inline void append(Arg&& arg, Args&&... 
args)    { -   if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) { +   if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {      s.append({arg, args...});     } else {      s.emplace_back(arg); @@ -300,7 +300,7 @@ namespace unicode::detail {     }    } -  template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true> +  template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>    inline void append_utf(const char32_t& value)    {     if (value < 0x80) { // 1 byte @@ -315,18 +315,18 @@ namespace unicode::detail {      throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));    } -  template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true> +  template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>    inline void append_utf(const char32_t& value)    {     if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)      append(static_cast<value_type>(value));     } else {      char32_t value_reduced{value - 0x10000}; -    append(static_cast<T>((value_reduced >> 10) + 0xD800), static_cast<T>((value_reduced & 0x3FF) + 0xDC00)); +    append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00));     }    } -  template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true> +  template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>    inline void append_utf(const char32_t& value)    {     // expect value to be already valid Unicode values (checked in input iterator) @@ -382,8 +382,8 @@ namespace unicode {   template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>>   struct iso_iterator { -  typedef iso_t input_type; -  typedef char32_t 
value_type; +  typedef iso_t value_type; +  typedef char32_t internal_type;    typedef char32_t& reference;    typedef char32_t* pointer;    typedef size_t difference_type; @@ -406,9 +406,9 @@ namespace unicode {    }    // return reference? -  value_type operator*() const +  internal_type operator*() const    { -   input_type value{*m_it}; +   value_type value{*m_it};     if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed     { @@ -416,7 +416,7 @@ namespace unicode {      if (it != Map.end())       return it->second;     } -   return static_cast<value_type>(static_cast<uint8_t>(value)); +   return static_cast<internal_type>(static_cast<uint8_t>(value));    }    iso_iterator& operator+=(size_t distance) @@ -554,28 +554,61 @@ namespace unicode {   template<> struct ConvertInputOptimizer<1>   {    static const uint32_t ascii_mask { 0x80808080 }; +  // 00112233 +  // 00112222 +  // 00111122 +  // 00111111 +  // 00001122 +  // 00001111 +  // 00000011   }; - template<int value_size> - struct ConvertOutputOptimizer {}; + template<> struct ConvertInputOptimizer<2> + { +  static const uint32_t ascii_mask { 0xFF80FF80 }; + }; +  + template<> struct ConvertInputOptimizer<4> + { +  static const uint32_t ascii_mask { 0xFFFFFF80 }; + }; +  + template<int AccuSize, class ConvertInputOptimizer> + struct ArchitectureOptimizer {}; - template<> struct ConvertOutputOptimizer<1> + template<class ConvertInputOptimizer> + struct ArchitectureOptimizer<4, ConvertInputOptimizer>   { -  template<typename input_value_type, class output_string_type, int code_units> +  typedef ConvertInputOptimizer input_optimizer; +  typedef uint32_t accu_type; +  static const size_t accu_size {4}; +  static const accu_type addr_mask {accu_size - 1}; +  static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask }; +  static const accu_type ascii_value { 0ULL }; +   +  template<typename input_value_type, class output_string_type>    inline 
static void append(const input_value_type* addr, output_string_type& s)    { -   s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), code_units); +   if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { +    s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); +   } else if constexpr(sizeof(input_value_type) == 1) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1]), +              static_cast<typename output_string_type::value_type>(addr[2]), +              static_cast<typename output_string_type::value_type>(addr[3])}); +   } else if constexpr(sizeof(input_value_type) == 2) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1])}); +   } else if constexpr(sizeof(input_value_type) == 4) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0])}); +   }    }   }; -  - template<int AccuSize, class ConvertInputOptimizer, class ConvertOutputOptimizer> - struct ArchitectureOptimizer {}; - template<class ConvertInputOptimizer, class ConvertOutputOptimizer> - struct ArchitectureOptimizer<8, ConvertInputOptimizer, ConvertOutputOptimizer> + template<class ConvertInputOptimizer> + struct ArchitectureOptimizer<8, ConvertInputOptimizer>   {    typedef ConvertInputOptimizer input_optimizer; -  typedef ConvertOutputOptimizer output_optimizer;    typedef uint64_t accu_type;    static const size_t accu_size {8};    static const accu_type addr_mask {accu_size - 1}; @@ -585,7 +618,26 @@ namespace unicode {    template<typename input_value_type, class output_string_type>    inline static void append(const input_value_type* addr, output_string_type& s)    { -   output_optimizer::template append<input_value_type, 
output_string_type, accu_size>(addr, s); +   if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { +    s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); +   } else if constexpr(sizeof(input_value_type) == 1) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1]), +              static_cast<typename output_string_type::value_type>(addr[2]), +              static_cast<typename output_string_type::value_type>(addr[3]), +              static_cast<typename output_string_type::value_type>(addr[4]), +              static_cast<typename output_string_type::value_type>(addr[5]), +              static_cast<typename output_string_type::value_type>(addr[6]), +              static_cast<typename output_string_type::value_type>(addr[7])}); +   } else if constexpr(sizeof(input_value_type) == 2) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1]), +              static_cast<typename output_string_type::value_type>(addr[2]), +              static_cast<typename output_string_type::value_type>(addr[3])}); +   } else if constexpr(sizeof(input_value_type) == 4) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1])}); +   }    }   }; @@ -595,12 +647,9 @@ namespace unicode {   {    typename To::string_type result; -  if constexpr(sizeof(typename From::string_type::value_type) == 1 && -               sizeof(typename To::value_type) == 1 && -               sizeof(size_t) >= 8) { +  if constexpr(sizeof(size_t) == 4 || sizeof(size_t) == 8) {     typedef ConvertInputOptimizer<sizeof(typename From::string_type::value_type)> input_optimizer; -   typedef 
ConvertOutputOptimizer<sizeof(typename To::value_type)> output_optimizer; -   typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer, output_optimizer> arch_optimizer; +   typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer> arch_optimizer;     auto begin{From::begin(s)};     auto end{From::end(s)}; @@ -612,7 +661,7 @@ namespace unicode {        typename arch_optimizer::accu_type data{*addr};        if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {         arch_optimizer::template append<typename From::string_type::value_type, typename To::string_type>(reinterpret_cast<const typename From::string_type::value_type*>(addr), result); -       begin += arch_optimizer::accu_size; +       begin += arch_optimizer::accu_size / sizeof(typename From::string_type::value_type);         ++addr;        } else {         // just advance one code unit for now | 
