diff options
Diffstat (limited to 'include')
| -rw-r--r-- | include/unicode.h | 117 | 
1 files changed, 72 insertions, 45 deletions
| diff --git a/include/unicode.h b/include/unicode.h index 171496e..6d7ef16 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -37,7 +37,7 @@ namespace unicode::detail {   using namespace std::string_literals; - template<typename T> + template<typename T, typename Container=std::basic_string<T>>   struct utf_iterator   {    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); @@ -48,7 +48,7 @@ namespace unicode::detail {    typedef char32_t* pointer;    typedef size_t difference_type;    typedef std::input_iterator_tag iterator_category; -  typedef std::basic_string<T> string_type; +  typedef Container string_type;    utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):     iterator(cbegin), end_iterator(cend) @@ -56,18 +56,25 @@ namespace unicode::detail {     calculate_value();    } -  utf_iterator<T>(const utf_iterator<T>& other) = default; -  utf_iterator<T>& operator=(const utf_iterator<T>& other) = default; +  utf_iterator(const utf_iterator& other) = default; +  utf_iterator& operator=(const utf_iterator& other) = default; -  size_t remaining_code_units() +  size_t remaining_code_units() const    { -   return end_iterator - iterator; +   return std::distance(iterator, end_iterator);    }    template<size_t index> -  T get_code_unit() +  T get_code_unit() const    { -   return *(iterator + index); +   if constexpr (std::is_same<Container, typename std::list<T>>::value) { +    // std::list doesn't support it + n +    auto it{iterator}; +    std::advance(it, index); +    return *it; +   } else { +    return *(iterator + index); +   }    }    inline static bool is_continuation_byte(T b) @@ -111,20 +118,20 @@ namespace unicode::detail {     if (!remaining)      return; -   utf8_t byte0 {get_code_unit<0>()}; +   utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};     if (byte0 & 0x80) { // 2-4 bytes      if (remaining >= 2) { -     utf8_t byte1 {get_code_unit<1>()}; +     utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};       if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes        value = value_byte0_of<2>(byte0) | continuation_value(byte1);        sequence_length = 2;       } else if (remaining >= 3) { -      utf8_t byte2 {get_code_unit<2>()}; +      utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};        if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes         value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);         sequence_length = 3;        } else if (remaining >= 4) { -       utf8_t byte3 {get_code_unit<3>()}; +       utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};         if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes          value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);          sequence_length = 4; @@ -154,7 +161,7 @@ namespace unicode::detail {     if (!remaining)      return; -   char16_t unit0 {get_code_unit<0>()}; +   char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};     if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)      value = unit0; @@ -163,7 +170,7 @@ namespace unicode::detail {      if (remaining < 2)       throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); -    char16_t unit1 {get_code_unit<1>()}; +    char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};      if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)       throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); @@ -179,7 +186,7 @@ namespace unicode::detail {     if (!remaining)      return; -   value = get_code_unit<0>(); +   value = static_cast<char32_t>(get_code_unit<0>());     if (!unicode::is_valid_unicode(value))      throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value))); @@ -202,16 +209,16 @@ namespace unicode::detail {    }    // pre-increment -  utf_iterator<T>& operator++() +  utf_iterator& operator++()    { -   iterator += sequence_length; +   std::advance(iterator, sequence_length);     calculate_value();     return *this;    } -  bool operator!=(const utf_iterator<T>& other) const +  bool operator!=(const utf_iterator& other) const    { -   return iterator != other.iterator; +   return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);    }    reference operator*() @@ -227,13 +234,13 @@ namespace unicode::detail {    size_t sequence_length{};   }; - template<typename T> + template<typename T, typename Container=std::basic_string<T>>   struct utf_back_insert_iterator   {    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);    typedef T value_type; -  typedef std::basic_string<T> string_type; +  typedef Container string_type;    typedef utf_back_insert_iterator& reference;    typedef utf_back_insert_iterator* pointer;    typedef size_t difference_type; @@ -378,7 +385,7 @@ namespace unicode {   using namespace detail; - template<unicode::detail::iso_map_type& Map=iso_8859_1_map> + template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<utf8_t>>   struct iso_iterator {    typedef utf8_t input_type;    typedef char32_t value_type; @@ -386,7 +393,8 @@ namespace unicode {    typedef char32_t* pointer;    typedef size_t difference_type;    typedef std::input_iterator_tag iterator_category; -  typedef std::basic_string<utf8_t>::const_iterator iterator; +  typedef typename Container::const_iterator iterator; +  typedef Container string_type;    iso_iterator(const iterator& it): m_it(it) {} @@ -420,14 +428,14 @@ namespace unicode {    iterator m_it;   }; - template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse> + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<utf8_t>>   struct iso_back_insert_iterator {    typedef iso_back_insert_iterator& reference;    typedef iso_back_insert_iterator* pointer;    typedef size_t difference_type;    typedef utf8_t value_type;    typedef std::output_iterator_tag iterator_category; -  typedef std::basic_string<utf8_t> string_type; +  typedef Container string_type;    iso_back_insert_iterator(string_type& s): s(s) {} @@ -478,18 +486,19 @@ namespace unicode {   struct ISO_8859   {    typedef utf8_t value_type; +  typedef typename InputIt::string_type string_type; -  static InputIt begin(const std::basic_string<value_type>& s) +  static InputIt begin(const typename InputIt::string_type& s)    {     return InputIt(s.cbegin());    } -  static InputIt end(const std::basic_string<value_type>& s) +  static InputIt end(const typename InputIt::string_type& s)    {     return InputIt(s.cend());    } -  static OutputIt back_inserter(std::basic_string<value_type>& s) +  static OutputIt back_inserter(typename OutputIt::string_type& s)    {     return OutputIt(s);    } @@ -499,20 +508,20 @@ namespace unicode {   template<typename InputIt, typename OutputIt>   struct UTF   { -  typedef typename InputIt::input_type input_type;    typedef typename OutputIt::value_type value_type; +  typedef typename InputIt::string_type string_type; -  static InputIt begin(const std::basic_string<input_type>& s) +  static InputIt begin(const typename InputIt::string_type& s)    {     return InputIt{s.cbegin(), s.cend()};    } -  static InputIt end(const std::basic_string<input_type>& s) +  static InputIt end(const typename InputIt::string_type& s)    {     return InputIt{s.cend(), s.cend()};    } -  static OutputIt back_inserter(std::basic_string<value_type>& s) +  static OutputIt back_inserter(typename OutputIt::string_type& s)    {     return OutputIt(s);    } @@ -527,10 +536,10 @@ namespace unicode {   typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;   // From and To are facets - template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value && std::is_empty<To>::value, bool> = true> - std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s) + template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s)   { -  std::basic_string<typename To::value_type> result; +  typename To::string_type result;    std::copy(From::begin(s), From::end(s), To::back_inserter(result)); @@ -561,27 +570,29 @@ namespace unicode {    typedef UTF_32 Facet;   }; - // From and To are from: utf8_t, char16_t and char32_t + // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t   template<typename From, typename To, -  std::enable_if_t<std::is_trivial<From>::value && std::is_trivial<To>::value, bool> = true - > - std::basic_string<To> convert(const std::basic_string<From>& s) +  typename FromContainer=std::basic_string<From>, +  typename ToContainer=std::basic_string<To>, +  std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true> + ToContainer convert(const FromContainer& s)   {    typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; -  std::basic_string<To> result; +  ToContainer result;    std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));    return result;   } + // From and To are containers   template<typename FromContainer, typename ToContainer,    std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true   >   ToContainer convert(const FromContainer& s)   { -  typedef UTF<utf_iterator<typename FromContainer::value_type>, utf_back_insert_iterator<typename ToContainer::value_type>> UTF_Trait; +  typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait;    ToContainer result; @@ -590,9 +601,25 @@ namespace unicode {    return result;   } + // Container version + template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true> + bool is_valid_utf(const Container& s) + { +  typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait; +   +  try { +   std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); +  } catch (const std::invalid_argument&) { +   return false; +  } +  return true; + } +   // basic type version - template<typename T> - bool is_valid_utf(const std::basic_string<T>& s) + template<typename T, +  typename Container=std::basic_string<T>, +  std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true> + bool is_valid_utf(const Container& s)   {    typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait; @@ -605,8 +632,8 @@ namespace unicode {   }   // Facet version - template<typename Facet> - bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s) + template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true> + bool is_valid_utf(const typename Facet::string_type& s)   {    try {     std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); | 
