From 721064dc293d8915fbb33d83bd983a40dcca180f Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Thu, 23 Dec 2021 13:27:34 +0100 Subject: Speed optimization --- include/unicode.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) (limited to 'include/unicode.h') diff --git a/include/unicode.h b/include/unicode.h index 43dc44e..5774db7 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -203,6 +204,17 @@ namespace unicode::detail { return calculate_value(); } + utf_iterator& operator+=(size_t distance) + { + std::advance(iterator, distance); + return *this; + } + + size_t operator-(const utf_iterator& other) const + { + return iterator - other.iterator; + } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -394,7 +406,7 @@ namespace unicode { } // return reference? - value_type operator*() + value_type operator*() const { input_type value{*m_it}; @@ -407,6 +419,17 @@ namespace unicode { return static_cast(static_cast(value)); } + iso_iterator& operator+=(size_t distance) + { + std::advance(m_it, distance); + return *this; + } + + difference_type operator-(const iso_iterator& other) const + { + return m_it - other.m_it; + } + private: iterator m_it; }; @@ -518,13 +541,59 @@ namespace unicode { typedef UTF, utf_back_insert_iterator> UTF_16; typedef UTF, utf_back_insert_iterator> UTF_32; + // std::distance doesn't work here: it is based on "output" distance of iterators + template + size_t input_distance(const Iterator& it1, const Iterator& it2) + { + return it2 - it1; + } + // From and To are facets template::value, bool> = true> typename To::string_type convert(const typename From::string_type& s) { typename To::string_type result; - std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + if constexpr(sizeof(typename From::string_type::value_type) == 1 && + sizeof(typename To::value_type) == 1 && + sizeof(size_t) >= 8) { + auto begin{From::begin(s)}; + auto end{From::end(s)}; + auto back_inserter{To::back_inserter(result)}; + auto addr{reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])}; + while (input_distance(begin, end) >= 8) { + if (((uintptr_t)(void*)addr & 7) == 0) { + while (input_distance(begin, end) >= 8) { + uint64_t data{*addr}; + if ((data & 0x8080808080808080ULL) == 0ULL) { + result.append(reinterpret_cast(addr), 8); + begin += 8; + ++addr; + } else { + // just advance one code unit for now + back_inserter = *begin; + ++begin; + break; + } + } + } + + // keep up after unaligned Non-ASCII code points + while (begin!= end && (uintptr_t)(void*)(addr = reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])) & 7) { + back_inserter = *begin; + ++begin; + } + } + + // remainder < 8 bytes + while (begin != end) { + back_inserter = *begin; + ++begin; + } + + } else { + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + } return result; } -- cgit v1.2.3