diff options
| -rw-r--r-- | include/unicode.h | 73 | ||||
| -rw-r--r-- | src/test-unicode.cpp | 3 | 
2 files changed, 74 insertions, 2 deletions
| diff --git a/include/unicode.h b/include/unicode.h index 43dc44e..5774db7 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -8,6 +8,7 @@  #pragma once  #include <algorithm> +#include <cstdint>  #include <iterator>  #include <list>  #include <memory> @@ -203,6 +204,17 @@ namespace unicode::detail {     return calculate_value();    } +  utf_iterator& operator+=(size_t distance) +  { +   std::advance(iterator, distance); +   return *this; +  } + +  size_t operator-(const utf_iterator& other) const +  { +   return iterator - other.iterator; +  } +   private:    typename string_type::const_iterator iterator;    typename string_type::const_iterator end_iterator; @@ -394,7 +406,7 @@ namespace unicode {    }    // return reference? -  value_type operator*() +  value_type operator*() const    {     input_type value{*m_it}; @@ -407,6 +419,17 @@ namespace unicode {     return static_cast<value_type>(static_cast<uint8_t>(value));    } +  iso_iterator& operator+=(size_t distance) +  { +   std::advance(m_it, distance); +   return *this; +  } + +  difference_type operator-(const iso_iterator& other) const +  { +   return m_it - other.m_it; +  } +   private:    iterator m_it;   }; @@ -518,13 +541,59 @@ namespace unicode {   typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;   typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; + // std::distance doesn't work here: it is based on "output" distance of iterators + template<class Iterator> + size_t input_distance(const Iterator& it1, const Iterator& it2) + { +  return it2 - it1; + } +   // From and To are facets   template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>   typename To::string_type convert(const typename From::string_type& s)   {    typename To::string_type result; -  std::copy(From::begin(s), From::end(s), To::back_inserter(result)); +  if constexpr(sizeof(typename From::string_type::value_type) == 1 && +               sizeof(typename To::value_type) == 1 && +               sizeof(size_t) >= 8) { +   auto begin{From::begin(s)}; +   auto end{From::end(s)}; +   auto back_inserter{To::back_inserter(result)}; +   auto addr{reinterpret_cast<const uint64_t*>(&s.data()[s.size() - input_distance(begin, end)])}; +   while (input_distance(begin, end) >= 8) { +    if (((uintptr_t)(void*)addr & 7) == 0) { +     while (input_distance(begin, end) >= 8) { +      uint64_t data{*addr}; +      if ((data & 0x8080808080808080ULL) == 0ULL) { +       result.append(reinterpret_cast<const typename To::value_type*>(addr), 8); +       begin += 8; +       ++addr; +      } else { +       // just advance one code unit for now +       back_inserter = *begin; +       ++begin; +       break; +      } +     } +    } + +    // keep up after unaligned Non-ASCII code points +    while (begin!= end && (uintptr_t)(void*)(addr = reinterpret_cast<const uint64_t*>(&s.data()[s.size() - input_distance(begin, end)])) & 7) { +     back_inserter = *begin; +     ++begin; +    } +   } + +   // remainder < 8 bytes    +   while (begin != end) { +    back_inserter = *begin; +    ++begin; +   } + +  } else { +   std::copy(From::begin(s), From::end(s), To::back_inserter(result)); +  }    return result;   } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 59d55b9..d638cbb 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -566,6 +566,9 @@ BOOST_AUTO_TEST_CASE(convert)   // deque   BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{});   BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'})); + // yet unsupported: + //BOOST_CHECK((unicode::convert<utf8_t, char16_t>(std::deque<utf8_t>{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque<char16_t>{u'ä', u'ö', u'ü'})); + //BOOST_CHECK((unicode::convert<unicode::UTF_8, unicode::UTF_16>(std::deque<utf8_t>{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque<char16_t>{u'ä', u'ö', u'ü'}));   // deque with uint8_t, uint16_t   BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{})) == std::deque<uint16_t>{}); | 
