diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/unicode.h | 117 |
1 files changed, 72 insertions, 45 deletions
diff --git a/include/unicode.h b/include/unicode.h index 171496e..6d7ef16 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -37,7 +37,7 @@ namespace unicode::detail { using namespace std::string_literals; - template<typename T> + template<typename T, typename Container=std::basic_string<T>> struct utf_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); @@ -48,7 +48,7 @@ namespace unicode::detail { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string<T> string_type; + typedef Container string_type; utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) @@ -56,18 +56,25 @@ namespace unicode::detail { calculate_value(); } - utf_iterator<T>(const utf_iterator<T>& other) = default; - utf_iterator<T>& operator=(const utf_iterator<T>& other) = default; + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; - size_t remaining_code_units() + size_t remaining_code_units() const { - return end_iterator - iterator; + return std::distance(iterator, end_iterator); } template<size_t index> - T get_code_unit() + T get_code_unit() const { - return *(iterator + index); + if constexpr (std::is_same<Container, typename std::list<T>>::value) { + // std::list doesn't support it + n + auto it{iterator}; + std::advance(it, index); + return *it; + } else { + return *(iterator + index); + } } inline static bool is_continuation_byte(T b) @@ -111,20 +118,20 @@ namespace unicode::detail { if (!remaining) return; - utf8_t byte0 {get_code_unit<0>()}; + utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes if (remaining >= 2) { - utf8_t byte1 {get_code_unit<1>()}; + utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); sequence_length = 2; } else if (remaining >= 3) { - utf8_t byte2 {get_code_unit<2>()}; + utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())}; if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); sequence_length = 3; } else if (remaining >= 4) { - utf8_t byte3 {get_code_unit<3>()}; + utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())}; if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); sequence_length = 4; @@ -154,7 +161,7 @@ namespace unicode::detail { if (!remaining) return; - char16_t unit0 {get_code_unit<0>()}; + char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) value = unit0; @@ -163,7 +170,7 @@ namespace unicode::detail { if (remaining < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - char16_t unit1 {get_code_unit<1>()}; + char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())}; if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); @@ -179,7 +186,7 @@ namespace unicode::detail { if (!remaining) return; - value = get_code_unit<0>(); + value = static_cast<char32_t>(get_code_unit<0>()); if (!unicode::is_valid_unicode(value)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value))); @@ -202,16 +209,16 @@ namespace unicode::detail { } // pre-increment - utf_iterator<T>& operator++() + utf_iterator& operator++() { - iterator += sequence_length; + std::advance(iterator, sequence_length); calculate_value(); return *this; } - bool operator!=(const utf_iterator<T>& other) const + bool operator!=(const utf_iterator& other) const { - return iterator != other.iterator; + return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } reference operator*() @@ -227,13 +234,13 @@ namespace unicode::detail { size_t sequence_length{}; }; - template<typename T> + template<typename T, typename Container=std::basic_string<T>> struct utf_back_insert_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); typedef T value_type; - typedef std::basic_string<T> string_type; + typedef Container string_type; typedef utf_back_insert_iterator& reference; typedef utf_back_insert_iterator* pointer; typedef size_t difference_type; @@ -378,7 +385,7 @@ namespace unicode { using namespace detail; - template<unicode::detail::iso_map_type& Map=iso_8859_1_map> + template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<utf8_t>> struct iso_iterator { typedef utf8_t input_type; typedef char32_t value_type; @@ -386,7 +393,8 @@ namespace unicode { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string<utf8_t>::const_iterator iterator; + typedef typename Container::const_iterator iterator; + typedef Container string_type; iso_iterator(const iterator& it): m_it(it) {} @@ -420,14 +428,14 @@ namespace unicode { iterator m_it; }; - template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse> + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<utf8_t>> struct iso_back_insert_iterator { typedef iso_back_insert_iterator& reference; typedef iso_back_insert_iterator* pointer; typedef size_t difference_type; typedef utf8_t value_type; typedef std::output_iterator_tag iterator_category; - typedef std::basic_string<utf8_t> string_type; + typedef Container string_type; iso_back_insert_iterator(string_type& s): s(s) {} @@ -478,18 +486,19 @@ namespace unicode { struct ISO_8859 { typedef utf8_t value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string<value_type>& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt(s.cbegin()); } - static InputIt end(const std::basic_string<value_type>& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt(s.cend()); } - static OutputIt back_inserter(std::basic_string<value_type>& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -499,20 +508,20 @@ namespace unicode { template<typename InputIt, typename OutputIt> struct UTF { - typedef typename InputIt::input_type input_type; typedef typename OutputIt::value_type value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string<input_type>& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt{s.cbegin(), s.cend()}; } - static InputIt end(const std::basic_string<input_type>& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt{s.cend(), s.cend()}; } - static OutputIt back_inserter(std::basic_string<value_type>& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -527,10 +536,10 @@ namespace unicode { typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; // From and To are facets - template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value && std::is_empty<To>::value, bool> = true> - std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s) + template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s) { - std::basic_string<typename To::value_type> result; + typename To::string_type result; std::copy(From::begin(s), From::end(s), To::back_inserter(result)); @@ -561,27 +570,29 @@ namespace unicode { typedef UTF_32 Facet; }; - // From and To are from: utf8_t, char16_t and char32_t + // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t template<typename From, typename To, - std::enable_if_t<std::is_trivial<From>::value && std::is_trivial<To>::value, bool> = true - > - std::basic_string<To> convert(const std::basic_string<From>& s) + typename FromContainer=std::basic_string<From>, + typename ToContainer=std::basic_string<To>, + std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true> + ToContainer convert(const FromContainer& s) { typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; - std::basic_string<To> result; + ToContainer result; std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); return result; } + // From and To are containers template<typename FromContainer, typename ToContainer, std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true > ToContainer convert(const FromContainer& s) { - typedef UTF<utf_iterator<typename FromContainer::value_type>, utf_back_insert_iterator<typename ToContainer::value_type>> UTF_Trait; + typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait; ToContainer result; @@ -590,9 +601,25 @@ namespace unicode { return result; } + // Container version + template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true> + bool is_valid_utf(const Container& s) + { + typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait; + + try { + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + // basic type version - template<typename T> - bool is_valid_utf(const std::basic_string<T>& s) + template<typename T, + typename Container=std::basic_string<T>, + std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true> + bool is_valid_utf(const Container& s) { typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait; @@ -605,8 +632,8 @@ namespace unicode { } // Facet version - template<typename Facet> - bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s) + template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true> + bool is_valid_utf(const typename Facet::string_type& s) { try { std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); |