diff options
-rw-r--r-- | include/unicode.h | 73 | ||||
-rw-r--r-- | src/test-unicode.cpp | 3 |
2 files changed, 74 insertions, 2 deletions
diff --git a/include/unicode.h b/include/unicode.h index 43dc44e..5774db7 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -8,6 +8,7 @@ #pragma once #include <algorithm> +#include <cstdint> #include <iterator> #include <list> #include <memory> @@ -203,6 +204,17 @@ namespace unicode::detail { return calculate_value(); } + utf_iterator& operator+=(size_t distance) + { + std::advance(iterator, distance); + return *this; + } + + size_t operator-(const utf_iterator& other) const + { + return iterator - other.iterator; + } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -394,7 +406,7 @@ namespace unicode { } // return reference? - value_type operator*() + value_type operator*() const { input_type value{*m_it}; @@ -407,6 +419,17 @@ namespace unicode { return static_cast<value_type>(static_cast<uint8_t>(value)); } + iso_iterator& operator+=(size_t distance) + { + std::advance(m_it, distance); + return *this; + } + + difference_type operator-(const iso_iterator& other) const + { + return m_it - other.m_it; + } + private: iterator m_it; }; @@ -518,13 +541,59 @@ namespace unicode { typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; + // std::distance doesn't work here: it is based on "output" distance of iterators + template<class Iterator> + size_t input_distance(const Iterator& it1, const Iterator& it2) + { + return it2 - it1; + } + // From and To are facets template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> typename To::string_type convert(const typename From::string_type& s) { typename To::string_type result; - std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + if constexpr(sizeof(typename From::string_type::value_type) == 1 && + sizeof(typename To::value_type) == 1 && + sizeof(size_t) >= 8) { + auto begin{From::begin(s)}; + auto end{From::end(s)}; + auto back_inserter{To::back_inserter(result)}; + auto addr{reinterpret_cast<const uint64_t*>(&s.data()[s.size() - input_distance(begin, end)])}; + while (input_distance(begin, end) >= 8) { + if (((uintptr_t)(void*)addr & 7) == 0) { + while (input_distance(begin, end) >= 8) { + uint64_t data{*addr}; + if ((data & 0x8080808080808080ULL) == 0ULL) { + result.append(reinterpret_cast<const typename To::value_type*>(addr), 8); + begin += 8; + ++addr; + } else { + // just advance one code unit for now + back_inserter = *begin; + ++begin; + break; + } + } + } + + // keep up after unaligned Non-ASCII code points + while (begin!= end && (uintptr_t)(void*)(addr = reinterpret_cast<const uint64_t*>(&s.data()[s.size() - input_distance(begin, end)])) & 7) { + back_inserter = *begin; + ++begin; + } + } + + // remainder < 8 bytes + while (begin != end) { + back_inserter = *begin; + ++begin; + } + + } else { + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + } return result; } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 59d55b9..d638cbb 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -566,6 +566,9 @@ BOOST_AUTO_TEST_CASE(convert) // deque BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{}); BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'})); + // yet unsupported: + //BOOST_CHECK((unicode::convert<utf8_t, char16_t>(std::deque<utf8_t>{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque<char16_t>{u'ä', u'ö', u'ü'})); + //BOOST_CHECK((unicode::convert<unicode::UTF_8, unicode::UTF_16>(std::deque<utf8_t>{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque<char16_t>{u'ä', u'ö', u'ü'})); // deque with uint8_t, uint16_t BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{})) == std::deque<uint16_t>{}); |