diff options
author | Roland Reichwein <mail@reichwein.it> | 2022-01-01 20:25:34 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2022-01-01 20:25:34 +0100 |
commit | 52d4375b10d920a59f1309c272a2e525feb1c25d (patch) | |
tree | 9d5417a9d214f4b0ba68b75e8908e28da46dd5c8 /include/unicode/iso.h | |
parent | ae7b430afd1239947b8f8b2d9dc0ca72dbce91ac (diff) |
Separated out headers files; optimizations; type traits; better naming
Diffstat (limited to 'include/unicode/iso.h')
-rw-r--r-- | include/unicode/iso.h | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/include/unicode/iso.h b/include/unicode/iso.h new file mode 100644 index 0000000..9b20afd --- /dev/null +++ b/include/unicode/iso.h @@ -0,0 +1,189 @@ +#pragma once + +#include "types.h" + +#include <stdexcept> +#include <string> +#include <unordered_map> + +namespace unicode::detail { + + using namespace std::string_literals; + + typedef std::unordered_map<iso_t, char32_t> iso_map_type; + typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + static inline iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + static inline iso_map_type iso_8859_15_map { + { '\xA4', U'\u20AC' }, // € + { '\xA6', U'\u0160' }, // Š + { '\xA8', U'\u0161' }, // š + { '\xB4', U'\u017D' }, // Ž + { '\xB8', U'\u017E' }, // ž + { '\xBC', U'\u0152' }, // Œ + { '\xBD', U'\u0153' }, // œ + { '\xBE', U'\u0178' }, // Ÿ + }; + + inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { + iso_map_type_reverse result; + std::for_each(map.cbegin(), map.cend(), + [&](const iso_map_type::value_type& pair) + { + result.emplace(pair.second, pair.first); + result.emplace(static_cast<char32_t>(static_cast<uint8_t>(pair.first)), 0); // map invalid characters to a known non-mapped value as marker + }); + return result; + } + + static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + + template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>> + struct iso_iterator { + typedef iso_t value_type; + typedef char32_t internal_type; + typedef char32_t& reference; + typedef char32_t* pointer; + typedef size_t difference_type; + typedef std::input_iterator_tag iterator_category; + typedef typename Container::const_iterator iterator; + typedef Container string_type; + + iso_iterator(const iterator& it): m_it(it) {} + + // pre-increment + iso_iterator& operator++() + { + ++m_it; + return *this; + } + + bool operator!=(const iso_iterator& other) const + { + return m_it != other.m_it; + } + + // return reference? + internal_type operator*() const + { + value_type value{*m_it}; + + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed + { + auto it{Map.find(value)}; + if (it != Map.end()) + return it->second; + } + return static_cast<internal_type>(static_cast<uint8_t>(value)); + } + + iso_iterator& operator+=(size_t distance) + { + std::advance(m_it, distance); + return *this; + } + + difference_type operator-(const iso_iterator& other) const + { + return m_it - other.m_it; + } + + private: + iterator m_it; + }; + + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>> + struct iso_back_insert_iterator { + typedef iso_back_insert_iterator& reference; + typedef iso_back_insert_iterator* pointer; + typedef size_t difference_type; + typedef iso_t value_type; + typedef char32_t internal_type; + typedef std::output_iterator_tag iterator_category; + typedef Container string_type; + + iso_back_insert_iterator(string_type& s): s(s) {} + + iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other) + { + if (std::addressof(other.s) != std::addressof(s)) + throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); + + return *this; + } + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + reference operator=(const internal_type& value) + { + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed + { + auto it{Map.find(value)}; + if (it != Map.end()) { + if (it->second == 0) // marker for non-mappable character found + throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value))); + s.push_back(it->second); + return *this; + } + } + + if (value > 255) + throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value))); + + s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value)); + return *this; + } + + private: + typename iso_back_insert_iterator::string_type& s; + }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + // Encoding for convert() and ISO-8859-* + template<typename InputIt, typename OutputIt> + struct ISO_8859 + { + typedef iso_t value_type; + typedef typename InputIt::string_type string_type; + + static InputIt begin(const typename InputIt::string_type& s) + { + return InputIt(s.cbegin()); + } + + static InputIt end(const typename InputIt::string_type& s) + { + return InputIt(s.cend()); + } + + static OutputIt back_inserter(typename OutputIt::string_type& s) + { + return OutputIt(s); + } + }; + + // Encoding for convert() + typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; + +} // namespace unicode + |