diff options
| -rw-r--r-- | Makefile | 6 | ||||
| -rw-r--r-- | include/unicode.h | 30 | ||||
| -rw-r--r-- | include/unicode/type_traits.h | 2 | ||||
| -rw-r--r-- | include/unicode/utf.h | 29 | ||||
| -rw-r--r-- | include/unicode/utf_fwd.h | 23 | 
5 files changed, 61 insertions, 29 deletions
| @@ -139,7 +139,13 @@ DISTFILES= \  	   src/file.h \  	   Makefile \  	   include/unicode.h \ +	   include/unicode/endian.h \ +	   include/unicode/iso.h \ +	   include/unicode/predicate.h \ +	   include/unicode/types.h \  	   include/unicode/type_traits.h \ +	   include/unicode/utf.h \ +	   include/unicode/utf_fwd.h \             debian/control \             debian/compat \             debian/copyright \ diff --git a/include/unicode.h b/include/unicode.h index a50f525..eb872ec 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -77,15 +77,15 @@ namespace unicode {    {     if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) {      s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); -   } else if constexpr(sizeof(input_value_type) == 1) { +   } else if constexpr(is_utf_8_v<input_value_type>) {      s.append({static_cast<typename output_string_type::value_type>(addr[0]),                static_cast<typename output_string_type::value_type>(addr[1]),                static_cast<typename output_string_type::value_type>(addr[2]),                static_cast<typename output_string_type::value_type>(addr[3])}); -   } else if constexpr(sizeof(input_value_type) == 2) { +   } else if constexpr(is_utf_16_v<input_value_type>) {      s.append({static_cast<typename output_string_type::value_type>(addr[0]),                static_cast<typename output_string_type::value_type>(addr[1])}); -   } else if constexpr(sizeof(input_value_type) == 4) { +   } else if constexpr(is_utf_32_v<input_value_type>) {      s.append({static_cast<typename output_string_type::value_type>(addr[0])});     }    } @@ -105,7 +105,7 @@ namespace unicode {    {     if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) {      s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); -   } else if constexpr(sizeof(input_value_type) == 1) { +   } else if constexpr(is_utf_8_v<input_value_type>) {      s.append({static_cast<typename output_string_type::value_type>(addr[0]),                static_cast<typename output_string_type::value_type>(addr[1]),                static_cast<typename output_string_type::value_type>(addr[2]), @@ -114,12 +114,12 @@ namespace unicode {                static_cast<typename output_string_type::value_type>(addr[5]),                static_cast<typename output_string_type::value_type>(addr[6]),                static_cast<typename output_string_type::value_type>(addr[7])}); -   } else if constexpr(sizeof(input_value_type) == 2) { +   } else if constexpr(is_utf_16_v<input_value_type>) {      s.append({static_cast<typename output_string_type::value_type>(addr[0]),                static_cast<typename output_string_type::value_type>(addr[1]),                static_cast<typename output_string_type::value_type>(addr[2]),                static_cast<typename output_string_type::value_type>(addr[3])}); -   } else if constexpr(sizeof(input_value_type) == 4) { +   } else if constexpr(is_utf_32_v<input_value_type>) {      s.append({static_cast<typename output_string_type::value_type>(addr[0]),                static_cast<typename output_string_type::value_type>(addr[1])});     } @@ -174,7 +174,7 @@ namespace unicode {    return result;   } - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 1), bool> = true> + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_8_v<To>, bool> = true>   inline void append_utf(std::basic_string<To>& result, const char32_t& value)   {    using From = char32_t; @@ -190,7 +190,7 @@ namespace unicode {    }   } - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 2), bool> = true> + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_16_v<To>, bool> = true>   inline void append_utf(std::basic_string<To>& result, const char32_t& value)   {    if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values @@ -201,7 +201,7 @@ namespace unicode {    }   } - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 4), bool> = true> + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_32_v<To>, bool> = true>   inline void append_utf(std::basic_string<To>& result, const char32_t& value)   {    // expect value to be already valid Unicode values (checked in input iterator) @@ -211,7 +211,7 @@ namespace unicode {   // Little Endian optimized version for UTF-8   // In block_mode, at least 4 bytes are in accu. On first call, even 8.   // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 1), bool> = true> + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true>   inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)   {  #if 1 @@ -265,7 +265,7 @@ namespace unicode {   // Little Endian optimized version for UTF-16   // In block_mode, at least 4 bytes are in accu. On first call, even 8.   // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 2), bool> = true> + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_16_v<From>, bool> = true>   inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)   {  #if 1 @@ -282,7 +282,7 @@ namespace unicode {    if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) {     // found 4 code units forming 3 code points in UTF-16;     // by definition of UTF-16, we have valid unicode values at this point -   if constexpr(sizeof(To) == 4) { +   if constexpr(is_utf_32_v<To>) {      //result.resize(result.size() + 2);      //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000;      result.append({ @@ -316,7 +316,7 @@ namespace unicode {   typename To::string_type convert_optimized_utf(const typename From::string_type& s)   {    typename To::string_type result; -  if constexpr(sizeof(typename From::value_type) == 4) { +  if constexpr(is_utf_32_v<typename From::value_type>) {     for (const auto value: s) {      if (is_valid_unicode(value))       append_utf(result, value); @@ -324,7 +324,7 @@ namespace unicode {       throw std::invalid_argument("Invalid Unicode character in UTF-32");     }  #if 0 -  } else if constexpr(sizeof(typename From::value_type) == 2) { +  } else if constexpr(is_utf_16_v<typename From::value_type>) {     for (int i = 0; i < s.size(); i++) {      typename From::value_type unit0{s[i]};      if (is_valid_unicode(unit0)) { @@ -388,7 +388,7 @@ namespace unicode {     } else {      throw std::invalid_argument("Invalid UTF input");     } -  } else if constexpr(accu_size == 8 && is_little_endian() && sizeof(typename From::value_type) == 1 && +  } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v<typename From::value_type> &&                        is_utf_encoding_v<From> && is_utf_encoding_v<To>) { // endian specific optimization     return convert_optimized_utf<From, To>(s);    } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index 3ee1d82..c3507e7 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -1,6 +1,6 @@  #pragma once -#include "utf.h" +#include "utf_fwd.h"  #include <string>  #include <type_traits> diff --git a/include/unicode/utf.h b/include/unicode/utf.h index dd504a7..81e8f2b 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -1,5 +1,8 @@  #pragma once +#include "utf_fwd.h" +#include "type_traits.h" +  #include <list>  #include <string>  #include <stdexcept> @@ -37,7 +40,7 @@ namespace unicode::detail {           (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right   } - template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true> + template<typename T, typename std::enable_if_t<is_utf_8_v<T>, bool> = true>   inline bool validate_utf(const std::basic_string<T>& s)   {    int i{}; @@ -78,7 +81,7 @@ namespace unicode::detail {    }   } - template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> + template<typename T, typename std::enable_if_t<is_utf_16_v<T>, bool> = true>   inline bool validate_utf(const std::basic_string<T>& s)   {    int i{}; @@ -95,7 +98,7 @@ namespace unicode::detail {    return true;   } - template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> + template<typename T, typename std::enable_if_t<is_utf_32_v<T>, bool> = true>   inline bool validate_utf(const std::basic_string<T>& s)   {    for (auto i: s) @@ -135,10 +138,10 @@ namespace unicode::detail {     return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...);   } - template<typename T, typename Container=std::basic_string<T>> + template<typename T, typename Container>   struct utf_iterator   { -  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); +  static_assert(is_utf_8_v<T> || is_utf_16_v<T> || is_utf_32_v<T>);    typedef T value_type;    typedef char32_t internal_type; @@ -199,13 +202,13 @@ namespace unicode::detail {     }    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> +  template<class X = value_type, typename std::enable_if_t<is_utf_8_v<X>, bool> = true>    inline internal_type calculate_value()    {     return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>()));    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> +  template<class X = value_type, typename std::enable_if_t<is_utf_16_v<X>, bool> = true>    inline internal_type calculate_value()    {     char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; @@ -226,7 +229,7 @@ namespace unicode::detail {     }    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> +  template<class X = value_type, typename std::enable_if_t<is_utf_32_v<X>, bool> = true>    inline internal_type calculate_value()    {     internal_type result {static_cast<internal_type>(get_code_unit<0>())}; @@ -296,10 +299,10 @@ namespace unicode::detail {     return utf8_trailing_byte<m - n - 1, From, To>(value);   } - template<typename T, typename Container=std::basic_string<T>> + template<typename T, typename Container>   struct utf_back_insert_iterator   { -  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); +  static_assert(is_utf_8_v<T> || is_utf_16_v<T> || is_utf_32_v<T>);    typedef T value_type;    typedef char32_t internal_type; @@ -341,7 +344,7 @@ namespace unicode::detail {     }    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> +  template<class X = value_type, typename std::enable_if_t<is_utf_8_v<X>, bool> = true>    inline void append_utf(const internal_type& value)    {     using Y = internal_type; @@ -357,7 +360,7 @@ namespace unicode::detail {     }    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> +  template<class X = value_type, typename std::enable_if_t<is_utf_16_v<X>, bool> = true>    inline void append_utf(const internal_type& value)    {     if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) @@ -368,7 +371,7 @@ namespace unicode::detail {     }    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> +  template<class X = value_type, typename std::enable_if_t<is_utf_32_v<X>, bool> = true>    inline void append_utf(const internal_type& value)    {     // expect value to be already valid Unicode values (checked in input iterator) diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h new file mode 100644 index 0000000..f3f6c52 --- /dev/null +++ b/include/unicode/utf_fwd.h @@ -0,0 +1,23 @@ +#pragma once + +// Forward declarations + +#include <string> + +namespace unicode::detail { + + template<typename T, typename Container=std::basic_string<T>> + struct utf_iterator; + + template<typename T, typename Container=std::basic_string<T>> + struct utf_back_insert_iterator; + +} // namespace unicode::detail + +namespace unicode { + + template<typename InputIt, typename OutputIt> + struct UTF; + +} // namespace unicode + | 
