From 268b7845af166c68b1c226f0be9ba5cf983ae91c Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 14 Feb 2021 17:50:51 +0100 Subject: Support different std containers, support different basic types --- include/unicode.h | 117 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/unicode.h b/include/unicode.h index 171496e..6d7ef16 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -37,7 +37,7 @@ namespace unicode::detail { using namespace std::string_literals; - template + template> struct utf_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); @@ -48,7 +48,7 @@ namespace unicode::detail { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string string_type; + typedef Container string_type; utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) @@ -56,18 +56,25 @@ namespace unicode::detail { calculate_value(); } - utf_iterator(const utf_iterator& other) = default; - utf_iterator& operator=(const utf_iterator& other) = default; + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; - size_t remaining_code_units() + size_t remaining_code_units() const { - return end_iterator - iterator; + return std::distance(iterator, end_iterator); } template - T get_code_unit() + T get_code_unit() const { - return *(iterator + index); + if constexpr (std::is_same>::value) { + // std::list doesn't support it + n + auto it{iterator}; + std::advance(it, index); + return *it; + } else { + return *(iterator + index); + } } inline static bool is_continuation_byte(T b) @@ -111,20 +118,20 @@ namespace unicode::detail { if (!remaining) return; - utf8_t byte0 {get_code_unit<0>()}; + utf8_t byte0 {static_cast(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes if (remaining >= 2) { - utf8_t byte1 {get_code_unit<1>()}; + utf8_t byte1 {static_cast(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); sequence_length = 2; } else if (remaining >= 3) { - utf8_t byte2 {get_code_unit<2>()}; + utf8_t byte2 {static_cast(get_code_unit<2>())}; if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); sequence_length = 3; } else if (remaining >= 4) { - utf8_t byte3 {get_code_unit<3>()}; + utf8_t byte3 {static_cast(get_code_unit<3>())}; if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); sequence_length = 4; @@ -154,7 +161,7 @@ namespace unicode::detail { if (!remaining) return; - char16_t unit0 {get_code_unit<0>()}; + char16_t unit0 {static_cast(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) value = unit0; @@ -163,7 +170,7 @@ namespace unicode::detail { if (remaining < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - char16_t unit1 {get_code_unit<1>()}; + char16_t unit1 {static_cast(get_code_unit<1>())}; if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); @@ -179,7 +186,7 @@ namespace unicode::detail { if (!remaining) return; - value = get_code_unit<0>(); + value = static_cast(get_code_unit<0>()); if (!unicode::is_valid_unicode(value)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); @@ -202,16 +209,16 @@ namespace unicode::detail { } // pre-increment - utf_iterator& operator++() + utf_iterator& operator++() { - iterator += sequence_length; + std::advance(iterator, sequence_length); calculate_value(); return *this; } - bool operator!=(const utf_iterator& other) const + bool operator!=(const utf_iterator& other) const { - return iterator != other.iterator; + return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } reference operator*() @@ -227,13 +234,13 @@ namespace unicode::detail { size_t sequence_length{}; }; - template + template> struct utf_back_insert_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); typedef T value_type; - typedef std::basic_string string_type; + typedef Container string_type; typedef utf_back_insert_iterator& reference; typedef utf_back_insert_iterator* pointer; typedef size_t difference_type; @@ -378,7 +385,7 @@ namespace unicode { using namespace detail; - template + template> struct iso_iterator { typedef utf8_t input_type; typedef char32_t value_type; @@ -386,7 +393,8 @@ namespace unicode { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string::const_iterator iterator; + typedef typename Container::const_iterator iterator; + typedef Container string_type; iso_iterator(const iterator& it): m_it(it) {} @@ -420,14 +428,14 @@ namespace unicode { iterator m_it; }; - template + template> struct iso_back_insert_iterator { typedef iso_back_insert_iterator& reference; typedef iso_back_insert_iterator* pointer; typedef size_t difference_type; typedef utf8_t value_type; typedef std::output_iterator_tag iterator_category; - typedef std::basic_string string_type; + typedef Container string_type; iso_back_insert_iterator(string_type& s): s(s) {} @@ -478,18 +486,19 @@ namespace unicode { struct ISO_8859 { typedef utf8_t value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt(s.cbegin()); } - static InputIt end(const std::basic_string& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt(s.cend()); } - static OutputIt back_inserter(std::basic_string& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -499,20 +508,20 @@ namespace unicode { template struct UTF { - typedef typename InputIt::input_type input_type; typedef typename OutputIt::value_type value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt{s.cbegin(), s.cend()}; } - static InputIt end(const std::basic_string& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt{s.cend(), s.cend()}; } - static OutputIt back_inserter(std::basic_string& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -527,10 +536,10 @@ namespace unicode { typedef UTF, utf_back_insert_iterator> UTF_32; // From and To are facets - template::value && std::is_empty::value, bool> = true> - std::basic_string convert(const std::basic_string& s) + template::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s) { - std::basic_string result; + typename To::string_type result; std::copy(From::begin(s), From::end(s), To::back_inserter(result)); @@ -561,27 +570,29 @@ namespace unicode { typedef UTF_32 Facet; }; - // From and To are from: utf8_t, char16_t and char32_t + // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t template::value && std::is_trivial::value, bool> = true - > - std::basic_string convert(const std::basic_string& s) + typename FromContainer=std::basic_string, + typename ToContainer=std::basic_string, + std::enable_if_t::value && std::is_scalar::value && !std::is_empty::value, bool> = true> + ToContainer convert(const FromContainer& s) { typedef UTF, utf_back_insert_iterator> UTF_Trait; - std::basic_string result; + ToContainer result; std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); return result; } + // From and To are containers template::value && !std::is_empty::value, bool> = true > ToContainer convert(const FromContainer& s) { - typedef UTF, utf_back_insert_iterator> UTF_Trait; + typedef UTF, utf_back_insert_iterator> UTF_Trait; ToContainer result; @@ -590,9 +601,25 @@ namespace unicode { return result; } + // Container version + template::value, bool> = true> + bool is_valid_utf(const Container& s) + { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + + try { + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + // basic type version - template - bool is_valid_utf(const std::basic_string& s) + template, + std::enable_if_t::value && !std::is_empty::value, bool> = true> + bool is_valid_utf(const Container& s) { typedef UTF, utf_back_insert_iterator> UTF_Trait; @@ -605,8 +632,8 @@ namespace unicode { } // Facet version - template - bool is_valid_utf(const std::basic_string& s) + template::value, bool> = true> + bool is_valid_utf(const typename Facet::string_type& s) { try { std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); -- cgit v1.2.3