summary refs log tree commit diff homepage
path: root/include/unicode
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2022-01-01 20:25:34 +0100
committerRoland Reichwein <mail@reichwein.it>2022-01-01 20:25:34 +0100
commit52d4375b10d920a59f1309c272a2e525feb1c25d (patch)
tree9d5417a9d214f4b0ba68b75e8908e28da46dd5c8 /include/unicode
parentae7b430afd1239947b8f8b2d9dc0ca72dbce91ac (diff)
Separated out header files; optimizations; type traits; better naming
Diffstat (limited to 'include/unicode')
-rw-r--r--include/unicode/endian.h26
-rw-r--r--include/unicode/iso.h189
-rw-r--r--include/unicode/predicate.h21
-rw-r--r--include/unicode/type_traits.h77
-rw-r--r--include/unicode/types.h10
-rw-r--r--include/unicode/utf.h448
6 files changed, 771 insertions, 0 deletions
diff --git a/include/unicode/endian.h b/include/unicode/endian.h
new file mode 100644
index 0000000..38bc1b7
--- /dev/null
+++ b/include/unicode/endian.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#if __cplusplus >= 202002L
+#include <bit>
+#endif
+
+namespace unicode::detail {
+
+#if __cplusplus >= 202002L
+ consteval
+#else
+ constexpr uint16_t endian_value{0x0102};
+ constexpr uint8_t endian_value_1st_byte{(const uint8_t&)endian_value};
+
+ constexpr
+#endif
+ bool is_little_endian()
+ {
+#if __cplusplus >= 202002L
+ return std::endian::native == std::endian::little;
+#else
+ return endian_value_1st_byte == 0x02;
+#endif
+ }
+
+} // namespace unicode::detail
diff --git a/include/unicode/iso.h b/include/unicode/iso.h
new file mode 100644
index 0000000..9b20afd
--- /dev/null
+++ b/include/unicode/iso.h
@@ -0,0 +1,189 @@
+#pragma once
+
+#include "types.h"
+
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+namespace unicode::detail {
+
+ using namespace std::string_literals;
+
+ // Table type: ISO 8859 code unit -> Unicode code point
+ using iso_map_type = std::unordered_map<iso_t, char32_t>;
+ // Table type: Unicode code point -> ISO 8859 code unit
+ using iso_map_type_reverse = std::unordered_map<char32_t, iso_t>;
+
+ // The tables below are plain `inline` variables (external linkage) rather than
+ // `static inline`: they are used as reference template arguments, and with
+ // internal linkage every translation unit would get its own table objects and
+ // therefore its own, incompatible iterator and encoding types.
+
+ // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary
+ inline iso_map_type iso_8859_1_map;
+
+ // ISO-8859-15 is lower 8-bit of Unicode, except for:
+ inline iso_map_type iso_8859_15_map {
+   { '\xA4', U'\u20AC' }, // €
+   { '\xA6', U'\u0160' }, // Š
+   { '\xA8', U'\u0161' }, // š
+   { '\xB4', U'\u017D' }, // Ž
+   { '\xB8', U'\u017E' }, // ž
+   { '\xBC', U'\u0152' }, // Œ
+   { '\xBD', U'\u0153' }, // œ
+   { '\xBE', U'\u0178' }, // Ÿ
+ };
+
+ // Builds the code point -> code unit table that belongs to a
+ // code unit -> code point table.
+ //
+ // For every entry (unit -> code point) of `map` the result contains
+ //   code point -> unit, and
+ //   (code point numerically equal to unit) -> 0
+ // The second entry is a marker: that code point lost its slot in the encoding
+ // and therefore cannot be represented. 0 is used as marker value because it
+ // never occurs as a remapped code unit.
+ // emplace() never overwrites, so an entry that is already present wins.
+ inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) {
+   iso_map_type_reverse result;
+   for (const auto& [code_unit, code_point] : map) {
+     result.emplace(code_point, code_unit);
+     // go through unsigned char so code units >= 0x80 do not sign-extend
+     result.emplace(static_cast<char32_t>(static_cast<unsigned char>(code_unit)), 0); // map invalid characters to a known non-mapped value as marker
+   }
+   return result;
+ }
+
+ inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) };
+ inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) };
+
+ // Input iterator adaptor over a container of ISO 8859 code units.
+ // Dereferencing yields the Unicode code point of the current code unit.
+ //
+ // Map:       code unit -> code point table holding the entries that differ
+ //            from ISO-8859-1; iso_8859_1_map itself means "no remapping" and
+ //            skips the lookup at compile time.
+ // Container: container type the wrapped const_iterator belongs to.
+ template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>>
+ struct iso_iterator {
+   typedef iso_t value_type;
+   typedef char32_t internal_type;
+   typedef char32_t& reference;
+   typedef char32_t* pointer;
+   typedef size_t difference_type;
+   typedef std::input_iterator_tag iterator_category;
+   typedef typename Container::const_iterator iterator;
+   typedef Container string_type;
+
+   iso_iterator(const iterator& it): m_it(it) {}
+
+   // pre-increment
+   iso_iterator& operator++()
+   {
+     ++m_it;
+     return *this;
+   }
+
+   // Equality is required of an input iterator and was missing;
+   // it is the exact negation of operator!=.
+   bool operator==(const iso_iterator& other) const
+   {
+     return m_it == other.m_it;
+   }
+
+   bool operator!=(const iso_iterator& other) const
+   {
+     return m_it != other.m_it;
+   }
+
+   // Returns the code point by value (not by reference): it is computed on
+   // the fly and not stored anywhere.
+   internal_type operator*() const
+   {
+     value_type value{*m_it};
+
+     if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed
+     {
+       auto it{Map.find(value)};
+       if (it != Map.end())
+         return it->second;
+     }
+     // not remapped: the code point equals the unsigned code unit value
+     return static_cast<internal_type>(static_cast<uint8_t>(value));
+   }
+
+   // Advances by `distance` code units.
+   iso_iterator& operator+=(size_t distance)
+   {
+     std::advance(m_it, distance);
+     return *this;
+   }
+
+   // Number of code units between `other` and this iterator.
+   // difference_type is unsigned here, so `other` must not be behind *this.
+   difference_type operator-(const iso_iterator& other) const
+   {
+     return m_it - other.m_it;
+   }
+
+ private:
+   iterator m_it;
+ };
+
+ // Output iterator that appends Unicode code points to a container of
+ // ISO 8859 code units, converting each code point on assignment.
+ //
+ // Map:       code point -> code unit table for the target encoding;
+ //            iso_8859_1_map_reverse means "no remapping" and skips the lookup.
+ // Container: container type that receives the code units via push_back().
+ template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>>
+ struct iso_back_insert_iterator {
+   using reference = iso_back_insert_iterator&;
+   using pointer = iso_back_insert_iterator*;
+   using difference_type = size_t;
+   using value_type = iso_t;
+   using internal_type = char32_t;
+   using iterator_category = std::output_iterator_tag;
+   using string_type = Container;
+
+   iso_back_insert_iterator(string_type& s): m_target(s) {}
+
+   // The target is held by reference and cannot be re-seated, so assigning
+   // from an iterator on a different container is rejected with
+   // std::runtime_error; assigning from one on the same container does nothing.
+   iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other)
+   {
+     const bool same_target{std::addressof(other.m_target) == std::addressof(m_target)};
+     if (!same_target)
+       throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
+
+     return *this;
+   }
+
+   // Incrementing has no effect; appending happens on assignment.
+   reference operator++()
+   {
+     return *this;
+   }
+
+   // Returns the iterator itself so that `*it = code_point` reaches the
+   // converting assignment operator below.
+   reference operator*()
+   {
+     return *this;
+   }
+
+   // Converts `value` to one code unit and appends it.
+   // Throws std::invalid_argument if `value` has no representation.
+   reference operator=(const internal_type& value)
+   {
+     if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed
+     {
+       const auto found{Map.find(value)};
+       if (found != Map.end()) {
+         const auto code_unit{found->second};
+         if (code_unit == 0) // marker for non-mappable character found
+           throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value)));
+         m_target.push_back(code_unit);
+         return *this;
+       }
+     }
+
+     // not remapped: only code points of the lower 8 bit fit
+     if (value > 255)
+       throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value)));
+
+     m_target.push_back(static_cast<value_type>(value));
+     return *this;
+   }
+
+ private:
+   string_type& m_target;
+ };
+
+} // namespace unicode::detail
+
+namespace unicode {
+
+ using namespace detail;
+
+ // Encoding description for convert(), used for the ISO-8859-* family.
+ // Bundles the iterator types that decode from (InputIt) and encode to
+ // (OutputIt) the encoding and offers factory functions for them.
+ template<typename InputIt, typename OutputIt>
+ struct ISO_8859
+ {
+   using value_type = iso_t;
+   using string_type = typename InputIt::string_type;
+
+   // Decoding iterator positioned at the first code unit of `s`.
+   static InputIt begin(const string_type& s)
+   {
+     return InputIt(s.cbegin());
+   }
+
+   // Decoding iterator positioned one past the last code unit of `s`.
+   static InputIt end(const string_type& s)
+   {
+     return InputIt(s.cend());
+   }
+
+   // Encoding iterator that appends to `s`.
+   static OutputIt back_inserter(typename OutputIt::string_type& s)
+   {
+     return OutputIt(s);
+   }
+ };
+
+ // Ready-made encodings for convert(): ISO-8859-1 uses the default (identity)
+ // tables, ISO-8859-15 the tables with its deviating code units.
+ using ISO_8859_1 = ISO_8859<iso_iterator<>, iso_back_insert_iterator<>>;
+ using ISO_8859_15 = ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>>;
+
+} // namespace unicode
+
diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h
new file mode 100644
index 0000000..5f8c6a4
--- /dev/null
+++ b/include/unicode/predicate.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace unicode {
+
+ // Tells whether `value` is a valid Unicode code point for storage or
+ // interchange, i.e. not a UTF-16 surrogate (0xD800..0xDFFF) and not above
+ // 0x10FFFF.
+ //
+ // bits_to_compare: promise by the caller that `value` uses at most this many
+ //                  low bits; lets checks that cannot fail be skipped at
+ //                  compile time
+ // T: usually, char32_t, uint32_t etc.
+ template<size_t bits_to_compare = 32, typename T>
+ static inline bool is_valid_unicode(const T& value) noexcept
+ {
+   if constexpr(sizeof(T) == 1 || bits_to_compare <= 15)
+     return true; // at most 0x7FFF: below the surrogate range
+   else if constexpr(sizeof(T) == 2 || bits_to_compare <= 16)
+     // at most 0xFFFF: surrogates are exactly the values 11011xxx xxxxxxxx
+     return (value & 0xF800) != 0xD800;
+   else if constexpr(bits_to_compare <= 20)
+     // at most 0xFFFFF: cannot exceed 0x10FFFF, but the 16 bit mask test must
+     // not be used here, it would also reject e.g. 0x1D800
+     return value <= 0xD7FF || value >= 0xE000;
+   else
+     return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
+ }
+
+} // namespace unicode
+
diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h
new file mode 100644
index 0000000..3ee1d82
--- /dev/null
+++ b/include/unicode/type_traits.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "utf.h"
+
+#include <string>
+#include <type_traits>
+
+namespace unicode {
+
+ using namespace detail;
+
+ // helper traits
+
+ // The `value` members below are `static constexpr` (implicitly inline since
+ // C++17). A plain `static const` member has no definition, so odr-using it,
+ // e.g. binding it to a `const bool&`, fails at link time.
+
+ // True if T is an empty class type. Used to recognise encoding tag types,
+ // which carry no data members.
+ template<typename T>
+ struct is_encoding
+ {
+   static constexpr bool value{std::is_empty_v<T>};
+ };
+
+ template<typename T>
+ inline constexpr bool is_encoding_v {is_encoding<T>::value};
+
+ // True for every type that is not an empty class type, i.e. the exact
+ // opposite of is_encoding.
+ template<typename T>
+ struct is_container
+ {
+   static constexpr bool value{!std::is_empty_v<T>};
+ };
+
+ template<typename T>
+ inline constexpr bool is_container_v {is_container<T>::value};
+
+ // True if T is a trivial scalar type (and not an empty class type).
+ template<typename T>
+ struct is_char
+ {
+   static constexpr bool value{std::is_trivial_v<T> && std::is_scalar_v<T> && !std::is_empty_v<T>};
+ };
+
+ template<typename T>
+ inline constexpr bool is_char_v {is_char<T>::value};
+
+ // True if T is exactly the UTF encoding built from the default UTF iterators
+ // for T::value_type. T must provide a nested value_type.
+ // `value` is constexpr (implicitly inline) so that it can be odr-used without
+ // a separate definition.
+ template<typename T>
+ struct is_utf_encoding
+ {
+   static constexpr bool value{std::is_same_v<T, UTF<utf_iterator<typename T::value_type>, utf_back_insert_iterator<typename T::value_type>>>};
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_encoding_v {is_utf_encoding<T>::value};
+
+ // Code unit classification by size: a trivial type of 1, 2 or 4 bytes is
+ // treated as UTF-8, UTF-16 or UTF-32 code unit, respectively.
+ // The `value` members are constexpr (implicitly inline) so that they can be
+ // odr-used without a separate definition.
+
+ template<typename T>
+ struct is_utf_8
+ {
+   static constexpr bool value{std::is_trivial_v<T> && sizeof(T) == 1};
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_8_v {is_utf_8<T>::value};
+
+ template<typename T>
+ struct is_utf_16
+ {
+   static constexpr bool value{std::is_trivial_v<T> && sizeof(T) == 2};
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_16_v {is_utf_16<T>::value};
+
+ template<typename T>
+ struct is_utf_32
+ {
+   static constexpr bool value{std::is_trivial_v<T> && sizeof(T) == 4};
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_32_v {is_utf_32<T>::value};
+
+} // namespace unicode
diff --git a/include/unicode/types.h b/include/unicode/types.h
new file mode 100644
index 0000000..a4461d7
--- /dev/null
+++ b/include/unicode/types.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#ifdef __cpp_char8_t
+ // the compiler provides char8_t: use it as UTF-8 code unit type
+ using utf8_t = char8_t;
+#else
+ // no char8_t: fall back to plain char
+ using utf8_t = char;
+#endif
+// code unit type of the ISO 8859 encodings
+using iso_t = char;
+
diff --git a/include/unicode/utf.h b/include/unicode/utf.h
new file mode 100644
index 0000000..dd504a7
--- /dev/null
+++ b/include/unicode/utf.h
@@ -0,0 +1,448 @@
+#pragma once
+
+#include <list>
+#include <string>
+#include <stdexcept>
+
+namespace unicode::detail {
+
+ using namespace std::string_literals;
+
+ // Checks whether `byte` can start a UTF-8 sequence of exactly
+ // `sequence_length` code units:
+ //   1: 0xxxxxxx   2: 110xxxxx   3: 1110xxxx   4: 11110xxx
+ template<size_t sequence_length, typename value_type>
+ inline bool is_utf8_leading_byte(value_type byte) noexcept
+ {
+   static_assert(sequence_length >= 1 && sequence_length <= 4);
+
+   if constexpr(sequence_length == 1) {
+     return !(byte & 0x80);
+   } else {
+     // the mask selects the length marker bits plus the 0 bit terminating them
+     return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length));
+   }
+ }
+
+ // Checks whether `b` is a UTF-8 continuation byte (10xxxxxx).
+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept
+ {
+   return (b & 0b11000000) == 0b10000000;
+ }
+
+ // Checks the bit patterns of a complete UTF-8 sequence of 1 to 4 code units:
+ // the first byte must be the leading byte for that length, all others must be
+ // continuation bytes. The decoded value itself is not inspected.
+ template<typename value_type, typename... Tbytes>
+ inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
+ {
+   constexpr auto sequence_length{sizeof...(Tbytes) + 1};
+
+   static_assert(sequence_length <= 4, "UTF-8 sequences of 1 through 4 code units are supported");
+
+   return is_utf8_leading_byte<sequence_length>(byte0) &&
+     (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right
+ }
+
+ // Validates a string of 8 bit code units as UTF-8.
+ // Returns false on malformed or truncated sequences, on encoded UTF-16
+ // surrogates and on values above 0x10FFFF.
+ //
+ // The number of remaining code units is compared before any code unit behind
+ // s[i] is read; a subtraction like `size - 3` on the unsigned size would wrap
+ // around for short strings and allow reads past the end.
+ //
+ // NOTE(review): overlong encodings (e.g. 0xC0 0x80) are accepted here.
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+   const auto size{s.size()};
+   typename std::basic_string<T>::size_type i{};
+   while (i < size) {
+     const auto remaining{size - i}; // >= 1 inside the loop
+     if (is_utf8_sequence(s[i])) {
+       i += 1;
+     } else if (remaining >= 2 && is_utf8_sequence(s[i], s[i + 1])) {
+       i += 2;
+     } else if (remaining >= 3 && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) {
+       if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20))
+         return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF
+       i += 3;
+     } else if (remaining >= 4 && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) {
+       // upper 5 bits of the 21 bit value, i.e. the Unicode plane
+       if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11)
+         return false; // Unicode too big above 0x10FFFF
+       i += 4;
+     } else {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // Checks a complete UTF-16 sequence of 1 or 2 code units.
+ // 1 unit:  must be a valid code point on its own (no surrogate).
+ // 2 units: must be a leading surrogate (0xD800..0xDBFF) followed by a
+ //          trailing surrogate (0xDC00..0xDFFF).
+ template<typename value_type, typename... Twords>
+ inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
+ {
+   constexpr auto sequence_length{sizeof...(Twords) + 1};
+
+   static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+
+   if constexpr(sequence_length == 1) {
+     return is_valid_unicode(word0);
+   } else {
+     const auto leading{static_cast<char16_t>(word0)};
+     const auto trailing{static_cast<char16_t>((words, ...))}; // the pack holds exactly one unit here
+     const bool leading_ok{(leading & 0xFC00) == 0xD800};
+     const bool trailing_ok{(trailing & 0xFC00) == 0xDC00};
+     return leading_ok && trailing_ok;
+   }
+ }
+
+ // Validates a string of 16 bit code units as UTF-16.
+ // Returns false on unpaired or wrongly ordered surrogates.
+ //
+ // The index uses the string's own unsigned size_type: a signed int index
+ // mixed with the unsigned size compares with differing signedness and
+ // overflows on strings longer than INT_MAX.
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+   const auto size{s.size()};
+   typename std::basic_string<T>::size_type i{};
+   while (i < size) {
+     if (is_utf16_sequence(s[i])) {
+       i += 1;
+     } else if (size - i >= 2 && is_utf16_sequence(s[i], s[i + 1])) { // only read s[i + 1] if it exists
+       i += 2;
+     } else {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // Validates a string of 32 bit code units as UTF-32: every code unit must be
+ // a valid code point on its own.
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+   for (auto it{s.cbegin()}; it != s.cend(); ++it) {
+     const T code_unit{*it};
+     if (!is_valid_unicode(code_unit))
+       return false; // stop at the first invalid code point
+   }
+   return true;
+ }
+
+ // Extracts the payload bits of the leading byte of a UTF-8 sequence of
+ // `sequence_length` code units and moves them to their position in the
+ // code point (above the 6 bits of each continuation byte).
+ template<size_t sequence_length, typename value_type>
+ inline char32_t decode_utf8_leading_byte(value_type b) noexcept
+ {
+   const auto payload{static_cast<char32_t>(b & (0b1111111 >> sequence_length))};
+   return payload << ((sequence_length - 1) * 6);
+ }
+
+ // Extracts the 6 payload bits of a single continuation byte.
+ template<typename value_type>
+ inline char32_t decode_utf8_followup_byte(value_type b) noexcept
+ {
+   return static_cast<char32_t>(b & 0b00111111);
+ }
+
+ // Combines the payload bits of several continuation bytes; `b` is the most
+ // significant one, the last byte of the pack ends up in bits 0..5.
+ template<typename value_type, typename... Targs>
+ inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept
+ {
+   const auto upper{decode_utf8_followup_byte(b) << (6 * sizeof...(Targs))};
+   const auto lower{decode_utf8_followup_byte(bytes...)};
+   return upper | lower;
+ }
+
+ // Decodes a complete UTF-8 sequence of 1 to 4 code units to its code point.
+ // The bit patterns are not checked here; the caller has to do that first.
+ template<typename value_type, typename... Targs>
+ inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept
+ {
+   size_t constexpr sequence_length{sizeof...(Targs) + 1};
+
+   static_assert(sequence_length <= 4);
+
+   if constexpr (sequence_length == 1) {
+     return b; // single code unit: value is the code point
+   } else {
+     const char32_t leading_part{decode_utf8_leading_byte<sequence_length>(b)};
+     const char32_t followup_part{decode_utf8_followup_byte(bytes...)};
+     return leading_part | followup_part;
+   }
+ }
+
+ // Decoding iterator over a container of UTF-8, UTF-16 or UTF-32 code units
+ // (selected by sizeof(T)). Dereferencing yields one Unicode code point.
+ //
+ // Unusual protocol: operator*() decodes the code point at the current
+ // position AND advances behind it, whereas operator++() does nothing. Every
+ // position must therefore be dereferenced exactly once.
+ //
+ // operator*() throws std::invalid_argument on malformed or truncated input.
+ // Dereferencing an iterator that is already at the end is not checked.
+ template<typename T, typename Container=std::basic_string<T>>
+ struct utf_iterator
+ {
+   static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
+   typedef T value_type;
+   typedef char32_t internal_type;
+   typedef char32_t& reference;
+   typedef char32_t* pointer;
+   typedef size_t difference_type;
+   typedef std::input_iterator_tag iterator_category;
+   typedef Container string_type;
+
+   // cbegin: current position, cend: end of the underlying container
+   utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
+     iterator(cbegin), end_iterator(cend)
+   {
+   }
+
+   utf_iterator(const utf_iterator& other) = default;
+   utf_iterator& operator=(const utf_iterator& other) = default;
+
+   // Number of code units between the current position and the end.
+   inline size_t remaining_code_units() const noexcept
+   {
+     return std::distance(iterator, end_iterator);
+   }
+
+   // Reads the code unit `index` positions behind the current one.
+   // The caller must make sure that this code unit exists.
+   template<size_t index>
+   inline value_type get_code_unit() const noexcept
+   {
+     if constexpr (std::is_same_v<Container, typename std::list<value_type>>) {
+       // std::list doesn't support it + n
+       auto it{iterator};
+       std::advance(it, index);
+       return *it;
+     } else {
+       return *(iterator + index);
+     }
+   }
+
+   // Tries to decode `bytes` as a complete UTF-8 sequence. If they do not form
+   // one, the next code unit is appended and the attempt is repeated, up to
+   // the maximum of 4 code units. On success the position is advanced behind
+   // the sequence.
+   template<typename... Tbytes>
+   inline internal_type calculate_utf8_value(Tbytes... bytes)
+   {
+     size_t constexpr sequence_length{sizeof...(Tbytes)};
+     static_assert(sequence_length >= 1 && sequence_length <= 4);
+
+     if constexpr(sequence_length > 1) {
+       if (remaining_code_units() < sequence_length)
+         throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence");
+     }
+
+     if (is_utf8_sequence(bytes...)) {
+       std::advance(iterator, sequence_length);
+       internal_type result{decode_utf8_sequence(bytes...)};
+       // a sequence of n code units carries less than n * 6 payload bits
+       if (!unicode::is_valid_unicode<sequence_length * 6>(result))
+         throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
+       return result;
+     } else {
+       if constexpr(sequence_length <= 3) { // template recursion break condition: UTF-8 has 1..4 code units
+         // Make sure the next code unit exists BEFORE it is read; checking
+         // only inside the recursive call would come after the read at or
+         // past the end of the container.
+         if (remaining_code_units() <= sequence_length)
+           throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence");
+         return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>()));
+       } else
+         throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence");
+     }
+   }
+
+   // UTF-8: decode starting with the current code unit
+   template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
+   inline internal_type calculate_value()
+   {
+     return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>()));
+   }
+
+   // UTF-16: single code unit or surrogate pair
+   template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
+   inline internal_type calculate_value()
+   {
+     char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
+
+     if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane)
+       std::advance(iterator, 1);
+       return unit0;
+     } else {
+       if (remaining_code_units() < 2)
+         throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
+
+       char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
+       if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
+         throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
+
+       std::advance(iterator, 2);
+       // 10 payload bits per surrogate, offset by the size of the BMP
+       return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
+     }
+   }
+
+   // UTF-32: the code unit is the code point
+   template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
+   inline internal_type calculate_value()
+   {
+     internal_type result {static_cast<internal_type>(get_code_unit<0>())};
+
+     if (!unicode::is_valid_unicode(result))
+       throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
+
+     std::advance(iterator, 1);
+
+     return result;
+   }
+
+   // pre-increment: intentionally a no-op, operator*() already advances
+   utf_iterator& operator++()
+   {
+     return *this;
+   }
+
+   // Equality is required of an input iterator and was missing;
+   // it is the exact negation of operator!=.
+   bool operator==(const utf_iterator& other) const
+   {
+     return !(*this != other);
+   }
+
+   // Two iterators differ if their distances to their respective ends differ.
+   bool operator!=(const utf_iterator& other) const
+   {
+     return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
+   }
+
+   // Decodes one code point and advances behind it.
+   internal_type operator*()
+   {
+     return calculate_value();
+   }
+
+   // Advances by `distance` code units (not code points).
+   utf_iterator& operator+=(size_t distance)
+   {
+     std::advance(iterator, distance);
+     return *this;
+   }
+
+   // Number of code units between `other` and this iterator; needs a
+   // container with random access iterators.
+   size_t operator-(const utf_iterator& other) const
+   {
+     return iterator - other.iterator;
+   }
+
+ private:
+   typename string_type::const_iterator iterator;
+   typename string_type::const_iterator end_iterator;
+ };
+
+ // Leading byte of a UTF-8 sequence of n code units: the length marker
+ // (n one-bits followed by a zero bit) combined with the topmost payload bits
+ // of `value`. Bits above the lowest 8 are cut off by the conversion to To.
+ template<size_t n, typename From, typename To>
+ inline To utf8_byte0_of(const From& value)
+ {
+   const auto payload{value >> 6 * (n - 1)};
+   const auto length_marker{0xFF << (8 - n)};
+   return payload | length_marker;
+ }
+
+ // Continuation byte (10xxxxxx) carrying the n-th group of 6 payload bits of
+ // `value`; groups are counted from bit 0.
+ template<size_t n, typename From, typename To>
+ inline To utf8_trailing_byte(const From& value)
+ {
+   const auto six_bits{(value >> n * 6) & 0b111111};
+   return six_bits | 0b10000000;
+ }
+
+ // Byte number n (0 = leading byte) of the UTF-8 sequence of m >= 2 code units
+ // (i.e. non-ASCII) that encodes `value`.
+ // `value` is assumed to be a valid code point for a sequence of that length.
+ template<size_t n, size_t m, typename From, typename To>
+ inline To utf8_byte_n_of_m(const From& value)
+ {
+   if constexpr (n != 0) {
+     // byte n carries payload group m - n - 1, the last byte carries group 0
+     return utf8_trailing_byte<m - n - 1, From, To>(value);
+   } else {
+     return utf8_byte0_of<m, From, To>(value);
+   }
+ }
+
+ // Output iterator that appends Unicode code points to a container of UTF-8,
+ // UTF-16 or UTF-32 code units (selected by sizeof(T)), encoding each code
+ // point on assignment.
+ // The code points are not validated here; they are expected to come from a
+ // validating input iterator.
+ template<typename T, typename Container=std::basic_string<T>>
+ struct utf_back_insert_iterator
+ {
+   static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
+   using value_type = T;
+   using internal_type = char32_t;
+   using string_type = Container;
+   using reference = utf_back_insert_iterator&;
+   using pointer = utf_back_insert_iterator*;
+   using difference_type = size_t;
+   using iterator_category = std::output_iterator_tag;
+
+   utf_back_insert_iterator(string_type& s): m_target(s) {}
+
+   // The target is held by reference and cannot be re-seated, so assigning
+   // from an iterator on a different container is rejected with
+   // std::runtime_error; assigning from one on the same container does nothing.
+   utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other)
+   {
+     const bool same_target{std::addressof(other.m_target) == std::addressof(m_target)};
+     if (!same_target)
+       throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
+
+     return *this;
+   }
+
+   // Incrementing has no effect; appending happens on assignment.
+   reference operator++()
+   {
+     return *this;
+   }
+
+   // Returns the iterator itself so that `*it = code_point` reaches the
+   // encoding assignment operator below.
+   reference operator*()
+   {
+     return *this;
+   }
+
+   // Appends the given code units in order: std::basic_string gets them in
+   // one append() call, any other container one emplace_back() per unit.
+   template<typename... Args>
+   inline void append(Args&&... args)
+   {
+     if constexpr (!std::is_same_v<Container, typename std::basic_string<value_type>>) {
+       (m_target.emplace_back(args), ...);
+     } else {
+       m_target.append({args...});
+     }
+   }
+
+   // UTF-8: 1 to 4 code units, depending on the magnitude of the code point
+   template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
+   inline void append_utf(const internal_type& value)
+   {
+     using Y = internal_type;
+
+     if (value < 0x80) { // 1 byte
+       append(static_cast<value_type>(value));
+       return;
+     }
+
+     if (value < 0x800) { // 2 bytes
+       append(utf8_byte_n_of_m<0,2,Y,X>(value), utf8_byte_n_of_m<1,2,Y,X>(value));
+       return;
+     }
+
+     if (value < 0x10000) { // 3 bytes
+       append(utf8_byte_n_of_m<0,3,Y,X>(value), utf8_byte_n_of_m<1,3,Y,X>(value), utf8_byte_n_of_m<2,3,Y,X>(value));
+       return;
+     }
+
+     // 4 bytes
+     // expect value to be already valid Unicode values (checked in input iterator)
+     append(utf8_byte_n_of_m<0,4,Y,X>(value), utf8_byte_n_of_m<1,4,Y,X>(value), utf8_byte_n_of_m<2,4,Y,X>(value), utf8_byte_n_of_m<3,4,Y,X>(value));
+   }
+
+   // UTF-16: one code unit up to 0xFFFF, a surrogate pair above
+   template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
+   inline void append_utf(const internal_type& value)
+   {
+     if (value > 0xFFFF) {
+       // 20 remaining bits are split into 10 bits per surrogate
+       const internal_type offset{value - 0x10000};
+       const auto leading{static_cast<value_type>((offset >> 10) + 0xD800)};
+       const auto trailing{static_cast<value_type>((offset & 0x3FF) + 0xDC00)};
+       append(leading, trailing);
+     } else { // expect value to be already valid Unicode values (checked in input iterator)
+       append(static_cast<value_type>(value));
+     }
+   }
+
+   // UTF-32: the code point is the code unit
+   template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
+   inline void append_utf(const internal_type& value)
+   {
+     // expect value to be already valid Unicode values (checked in input iterator)
+     append(static_cast<value_type>(value));
+   }
+
+   // Encodes `value` and appends the resulting code units to the container.
+   reference operator=(const internal_type& value)
+   {
+     append_utf(value);
+     return *this;
+   }
+
+ private:
+   string_type& m_target;
+ };
+
+} // namespace unicode::detail
+
+namespace unicode {
+
+ // Encoding description for convert(), used for the UTF-* family.
+ // Bundles the iterator types that decode from (InputIt) and encode to
+ // (OutputIt) the encoding and offers factory functions for them.
+ template<typename InputIt, typename OutputIt>
+ struct UTF
+ {
+   using value_type = typename OutputIt::value_type;
+   using string_type = typename InputIt::string_type;
+
+   // Decoding iterator positioned at the first code unit of `s`.
+   static InputIt begin(const string_type& s)
+   {
+     const auto last{s.cend()};
+     return InputIt{s.cbegin(), last};
+   }
+
+   // Decoding iterator positioned at the end of `s`.
+   static InputIt end(const string_type& s)
+   {
+     const auto last{s.cend()};
+     return InputIt{last, last};
+   }
+
+   // Encoding iterator that appends to `s`.
+   static OutputIt back_inserter(typename OutputIt::string_type& s)
+   {
+     return OutputIt(s);
+   }
+ };
+
+ // Ready-made encodings for convert(), one per code unit type
+ using UTF_8 = UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>>;
+ using UTF_16 = UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>>;
+ using UTF_32 = UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>>;
+
+ // Maps a code unit type to its UTF encoding, e.g.
+ //   Encoding<typename decltype(s)::value_type>::type   or
+ //   Encoding_t<typename decltype(s)::value_type>
+ // The primary template has no member `type`; only the code unit types
+ // specialized below are supported.
+ template<typename T>
+ struct Encoding
+ {
+ };
+
+ // 8 bit code units: UTF-8
+ template<>
+ struct Encoding<utf8_t>
+ {
+   using type = UTF_8;
+ };
+
+ // 16 bit code units: UTF-16
+ template<>
+ struct Encoding<char16_t>
+ {
+   using type = UTF_16;
+ };
+
+ // 32 bit code units: UTF-32
+ template<>
+ struct Encoding<char32_t>
+ {
+   using type = UTF_32;
+ };
+
+ // Shorthand for Encoding<T>::type
+ template<typename T>
+ using Encoding_t = typename Encoding<T>::type;
+
+} // namespace unicode
+