From 611601ec36a5603bc9c94cdac9a307c4bb07c929 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 31 Jan 2021 19:00:34 +0100 Subject: Add facet based interface --- include/unicode.h | 221 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 203 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/unicode.h b/include/unicode.h index f31cbac..4b676bf 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -3,8 +3,10 @@ #pragma once #include +#include #include #include +#include #ifdef __cpp_char8_t // char8_t available @@ -31,7 +33,7 @@ namespace unicode::detail { template struct utf_iterator { - typedef char32_t value_type; + typedef T value_type; typedef char32_t& reference; typedef std::basic_string string_type; @@ -201,6 +203,7 @@ namespace unicode::detail { return value; } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -211,13 +214,14 @@ namespace unicode::detail { template struct utf_back_insert_iterator { + typedef T value_type; typedef std::basic_string string_type; typedef utf_back_insert_iterator& reference; utf_back_insert_iterator(string_type& s): s(s) {} // no-op - utf_back_insert_iterator& operator++() + reference operator++() { return *this; } @@ -302,39 +306,220 @@ namespace unicode::detail { return *this; } + private: typename utf_back_insert_iterator::string_type& s; }; - template - utf_back_insert_iterator utf_back_inserter(std::basic_string& s) - { - return utf_back_insert_iterator(s); + typedef std::unordered_map iso_map_type; + typedef std::unordered_map iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + iso_map_type iso_8859_15_map { + { '\xA4', U'\u20AC' }, // € + { '\xA6', U'\u0160' }, // Š + { '\xA8', U'\u0161' }, // š + { '\xB4', U'\u017D' }, // Ž + { '\xB8', U'\u017E' }, // ž + { '\xBC', U'\u0152' }, // Œ + { '\xBD', U'\u0153' }, // œ + { '\xBE', U'\u0178' }, // Ÿ + }; + + iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { + iso_map_type_reverse result; + std::for_each(map.cbegin(), map.cend(), + [&](const iso_map_type::value_type& pair) + { + result.emplace(pair.second, pair.first); + }); + return result; } - template - utf_iterator utf_begin(const std::basic_string& s) + iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + template + struct iso_iterator { + typedef char32_t value_type; + typedef char32_t& reference; + typedef std::basic_string::const_iterator iterator; + + iso_iterator(const iterator& it): m_it(it) {} + + // pre-increment + iso_iterator& operator++() + { + ++m_it; + return *this; + } + + bool operator!=(const iso_iterator& other) const + { + return m_it != other.m_it; + } + + // return reference? + value_type operator*() + { + utf8_t value{*m_it}; + + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) + return it->second; + } + return static_cast(static_cast(value)); + } + + private: + iterator m_it; + }; + + template + struct iso_back_insert_iterator { + typedef iso_back_insert_iterator& reference; + typedef std::basic_string string_type; + + iso_back_insert_iterator(string_type& s): s(s) {} + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + reference operator=(const char32_t& value) + { + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) { + s.push_back(it->second); + return *this; + } + } + + if (value > 255) + throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast(value))); + + s.push_back(static_cast(value)); + return *this; + } + + private: + typename iso_back_insert_iterator::string_type& s; + }; + + // Facet for convert() and ISO-8859-* + template + struct ISO_8859 + { + typedef utf8_t value_type; + + static InputIt begin(const std::basic_string& s) + { + return InputIt(s.cbegin()); + } + + static InputIt end(const std::basic_string& s) + { + return InputIt(s.cend()); + } + + static OutputIt back_inserter(std::basic_string& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() and UTF-* + template + struct UTF { - return utf_iterator{s.cbegin(), s.cend()}; + typedef typename InputIt::value_type value_type; // OutputIt::value_type is the same + + static InputIt begin(const std::basic_string& s) + { + return InputIt{s.cbegin(), s.cend()}; + } + + static InputIt end(const std::basic_string& s) + { + return InputIt{s.cend(), s.cend()}; + } + + static OutputIt back_inserter(std::basic_string& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() + typedef ISO_8859, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859, iso_back_insert_iterator> ISO_8859_15; + + typedef UTF, utf_back_insert_iterator> UTF_8; + typedef UTF, utf_back_insert_iterator> UTF_16; + typedef UTF, utf_back_insert_iterator> UTF_32; + + // From and To are facets + template + std::basic_string convert(const std::basic_string& s) + { + std::basic_string result; + + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + + return result; } + // Helper to get correct Facet from char type, e.g. Encoding::Facet template - utf_iterator utf_end(const std::basic_string& s) + struct Encoding { - return utf_iterator{s.cend(), s.cend()}; - } + }; -} // namespace + template<> + struct Encoding + { + typedef UTF_8 Facet; + }; -namespace unicode { + template<> + struct Encoding + { + typedef UTF_16 Facet; + }; - using namespace detail; + template<> + struct Encoding + { + typedef UTF_32 Facet; + }; + // From and To are from: utf8_t, char16_t and char32_t template - std::basic_string utf_to_utf(const std::basic_string& s) + std::basic_string convert(const std::basic_string& s) { std::basic_string result; - std::copy(utf_begin(s), utf_end(s), utf_back_inserter(result)); + std::copy(Encoding::Facet::begin(s), Encoding::Facet::end(s), Encoding::Facet::back_inserter(result)); return result; } @@ -343,7 +528,7 @@ namespace unicode { bool is_valid_utf(const std::basic_string& s) { try { - std::for_each(utf_begin(s), utf_end(s), [](const T& c){}); + std::for_each(Encoding::Facet::begin(s), Encoding::Facet::end(s), [](const T& c){}); } catch(...) { return false; } -- cgit v1.2.3