diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-01-31 19:00:34 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-01-31 19:00:34 +0100 |
commit | 611601ec36a5603bc9c94cdac9a307c4bb07c929 (patch) | |
tree | 0b1c27d5958a2a3bdfe3c421a27f6ab528fbc3e1 /include/unicode.h | |
parent | 2ef9f51df48b14556e236d14213233e1bd7f829a (diff) |
Add facet based interface
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- | include/unicode.h | 221 |
1 files changed, 203 insertions, 18 deletions
diff --git a/include/unicode.h b/include/unicode.h index f31cbac..4b676bf 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -3,8 +3,10 @@ #pragma once #include <algorithm> +#include <memory> #include <stdexcept> #include <string> +#include <unordered_map> #ifdef __cpp_char8_t // char8_t available @@ -31,7 +33,7 @@ namespace unicode::detail { template<typename T> struct utf_iterator { - typedef char32_t value_type; + typedef T value_type; typedef char32_t& reference; typedef std::basic_string<T> string_type; @@ -201,6 +203,7 @@ namespace unicode::detail { return value; } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -211,13 +214,14 @@ namespace unicode::detail { template<typename T> struct utf_back_insert_iterator { + typedef T value_type; typedef std::basic_string<T> string_type; typedef utf_back_insert_iterator& reference; utf_back_insert_iterator(string_type& s): s(s) {} // no-op - utf_back_insert_iterator& operator++() + reference operator++() { return *this; } @@ -302,39 +306,220 @@ namespace unicode::detail { return *this; } + private: typename utf_back_insert_iterator::string_type& s; }; - template<typename T> - utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s) - { - return utf_back_insert_iterator<T>(s); + typedef std::unordered_map<utf8_t, char32_t> iso_map_type; + typedef std::unordered_map<char32_t, utf8_t> iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + iso_map_type iso_8859_15_map { + { '\xA4', U'\u20AC' }, // € + { '\xA6', U'\u0160' }, // Š + { '\xA8', U'\u0161' }, // š + { '\xB4', U'\u017D' }, // Ž + { '\xB8', U'\u017E' }, // ž + { '\xBC', U'\u0152' }, // Œ + { '\xBD', U'\u0153' }, // œ + { '\xBE', U'\u0178' }, // Ÿ + }; + + iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { + iso_map_type_reverse result; + std::for_each(map.cbegin(), map.cend(), + [&](const iso_map_type::value_type& pair) + { + result.emplace(pair.second, pair.first); + }); + return result; } - template<typename T> - utf_iterator<T> utf_begin(const std::basic_string<T>& s) + iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + template<unicode::detail::iso_map_type& Map=iso_8859_1_map> + struct iso_iterator { + typedef char32_t value_type; + typedef char32_t& reference; + typedef std::basic_string<utf8_t>::const_iterator iterator; + + iso_iterator(const iterator& it): m_it(it) {} + + // pre-increment + iso_iterator& operator++() + { + ++m_it; + return *this; + } + + bool operator!=(const iso_iterator& other) const + { + return m_it != other.m_it; + } + + // return reference? + value_type operator*() + { + utf8_t value{*m_it}; + + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) + return it->second; + } + return static_cast<value_type>(static_cast<uint8_t>(value)); + } + + private: + iterator m_it; + }; + + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse> + struct iso_back_insert_iterator { + typedef iso_back_insert_iterator& reference; + typedef std::basic_string<utf8_t> string_type; + + iso_back_insert_iterator(string_type& s): s(s) {} + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + reference operator=(const char32_t& value) + { + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) { + s.push_back(it->second); + return *this; + } + } + + if (value > 255) + throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast<uint32_t>(value))); + + s.push_back(static_cast<utf8_t>(value)); + return *this; + } + + private: + typename iso_back_insert_iterator::string_type& s; + }; + + // Facet for convert() and ISO-8859-* + template<typename InputIt, typename OutputIt> + struct ISO_8859 + { + typedef utf8_t value_type; + + static InputIt begin(const std::basic_string<value_type>& s) + { + return InputIt(s.cbegin()); + } + + static InputIt end(const std::basic_string<value_type>& s) + { + return InputIt(s.cend()); + } + + static OutputIt back_inserter(std::basic_string<value_type>& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() and UTF-* + template<typename InputIt, typename OutputIt> + struct UTF { - return utf_iterator<T>{s.cbegin(), s.cend()}; + typedef typename InputIt::value_type value_type; // OutputIt::value_type is the same + + static InputIt begin(const std::basic_string<value_type>& s) + { + return InputIt{s.cbegin(), s.cend()}; + } + + static InputIt end(const std::basic_string<value_type>& s) + { + return InputIt{s.cend(), s.cend()}; + } + + static OutputIt back_inserter(std::basic_string<value_type>& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() + typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; + + typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; + typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; + typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; + + // From and To are facets + template<typename From, typename To> + std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s) + { + std::basic_string<typename To::value_type> result; + + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + + return result; } + // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet template<typename T> - utf_iterator<T> utf_end(const std::basic_string<T>& s) + struct Encoding { - return utf_iterator<T>{s.cend(), s.cend()}; - } + }; -} // namespace + template<> + struct Encoding<utf8_t> + { + typedef UTF_8 Facet; + }; -namespace unicode { + template<> + struct Encoding<char16_t> + { + typedef UTF_16 Facet; + }; - using namespace detail; + template<> + struct Encoding<char32_t> + { + typedef UTF_32 Facet; + }; + // From and To are from: utf8_t, char16_t and char32_t template<typename From, typename To> - std::basic_string<To> utf_to_utf(const std::basic_string<From>& s) + std::basic_string<To> convert(const std::basic_string<From>& s) { std::basic_string<To> result; - std::copy(utf_begin<From>(s), utf_end<From>(s), utf_back_inserter<To>(result)); + std::copy(Encoding<From>::Facet::begin(s), Encoding<From>::Facet::end(s), Encoding<To>::Facet::back_inserter(result)); return result; } @@ -343,7 +528,7 @@ namespace unicode { bool is_valid_utf(const std::basic_string<T>& s) { try { - std::for_each(utf_begin<T>(s), utf_end<T>(s), [](const T& c){}); + std::for_each(Encoding<T>::Facet::begin(s), Encoding<T>::Facet::end(s), [](const T& c){}); } catch(...) { return false; } |