From 721064dc293d8915fbb33d83bd983a40dcca180f Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Thu, 23 Dec 2021 13:27:34 +0100 Subject: Speed optimization --- include/unicode.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++-- src/test-unicode.cpp | 3 +++ 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/include/unicode.h b/include/unicode.h index 43dc44e..5774db7 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -203,6 +204,17 @@ namespace unicode::detail { return calculate_value(); } + utf_iterator& operator+=(size_t distance) + { + std::advance(iterator, distance); + return *this; + } + + size_t operator-(const utf_iterator& other) const + { + return iterator - other.iterator; + } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -394,7 +406,7 @@ namespace unicode { } // return reference? - value_type operator*() + value_type operator*() const { input_type value{*m_it}; @@ -407,6 +419,17 @@ namespace unicode { return static_cast(static_cast(value)); } + iso_iterator& operator+=(size_t distance) + { + std::advance(m_it, distance); + return *this; + } + + difference_type operator-(const iso_iterator& other) const + { + return m_it - other.m_it; + } + private: iterator m_it; }; @@ -518,13 +541,59 @@ namespace unicode { typedef UTF, utf_back_insert_iterator> UTF_16; typedef UTF, utf_back_insert_iterator> UTF_32; + // std::distance doesn't work here: it is based on "output" distance of iterators + template + size_t input_distance(const Iterator& it1, const Iterator& it2) + { + return it2 - it1; + } + // From and To are facets template::value, bool> = true> typename To::string_type convert(const typename From::string_type& s) { typename To::string_type result; - std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + if constexpr(sizeof(typename From::string_type::value_type) == 1 && + sizeof(typename To::value_type) == 1 && + sizeof(size_t) >= 8) { + auto begin{From::begin(s)}; + auto end{From::end(s)}; + auto back_inserter{To::back_inserter(result)}; + auto addr{reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])}; + while (input_distance(begin, end) >= 8) { + if (((uintptr_t)(void*)addr & 7) == 0) { + while (input_distance(begin, end) >= 8) { + uint64_t data{*addr}; + if ((data & 0x8080808080808080ULL) == 0ULL) { + result.append(reinterpret_cast(addr), 8); + begin += 8; + ++addr; + } else { + // just advance one code unit for now + back_inserter = *begin; + ++begin; + break; + } + } + } + + // keep up after unaligned Non-ASCII code points + while (begin!= end && (uintptr_t)(void*)(addr = reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])) & 7) { + back_inserter = *begin; + ++begin; + } + } + + // remainder < 8 bytes + while (begin != end) { + back_inserter = *begin; + ++begin; + } + + } else { + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + } return result; } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 59d55b9..d638cbb 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -566,6 +566,9 @@ BOOST_AUTO_TEST_CASE(convert) // deque BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); BOOST_CHECK((unicode::convert, std::deque>(std::deque{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque{L'ä', L'ö', L'ü'})); + // yet unsupported: + //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'})); + //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'})); // deque with uint8_t, uint16_t BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); -- cgit v1.2.3