diff options
Diffstat (limited to 'include/unicode')
-rw-r--r-- | include/unicode/endian.h | 26 | ||||
-rw-r--r-- | include/unicode/iso.h | 189 | ||||
-rw-r--r-- | include/unicode/predicate.h | 21 | ||||
-rw-r--r-- | include/unicode/type_traits.h | 77 | ||||
-rw-r--r-- | include/unicode/types.h | 10 | ||||
-rw-r--r-- | include/unicode/utf.h | 448 |
6 files changed, 771 insertions, 0 deletions
diff --git a/include/unicode/endian.h b/include/unicode/endian.h new file mode 100644 index 0000000..38bc1b7 --- /dev/null +++ b/include/unicode/endian.h @@ -0,0 +1,26 @@ +#pragma once + +#if __cplusplus >= 202002L +#include <bit> +#endif + +namespace unicode::detail { + +#if __cplusplus >= 202002L + consteval +#else + constexpr uint16_t endian_value{0x0102}; + constexpr uint8_t endian_value_1st_byte{(const uint8_t&)endian_value}; + + constexpr +#endif + bool is_little_endian() + { +#if __cplusplus >= 202002L + return std::endian::native == std::endian::little; +#else + return endian_value_1st_byte == 0x02; +#endif + } + +} // namespace unicode::detail diff --git a/include/unicode/iso.h b/include/unicode/iso.h new file mode 100644 index 0000000..9b20afd --- /dev/null +++ b/include/unicode/iso.h @@ -0,0 +1,189 @@ +#pragma once + +#include "types.h" + +#include <stdexcept> +#include <string> +#include <unordered_map> + +namespace unicode::detail { + + using namespace std::string_literals; + + typedef std::unordered_map<iso_t, char32_t> iso_map_type; + typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + static inline iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + static inline iso_map_type iso_8859_15_map { + { '\xA4', U'\u20AC' }, // € + { '\xA6', U'\u0160' }, // Š + { '\xA8', U'\u0161' }, // š + { '\xB4', U'\u017D' }, // Ž + { '\xB8', U'\u017E' }, // ž + { '\xBC', U'\u0152' }, // Œ + { '\xBD', U'\u0153' }, // œ + { '\xBE', U'\u0178' }, // Ÿ + }; + + inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { + iso_map_type_reverse result; + std::for_each(map.cbegin(), map.cend(), + [&](const iso_map_type::value_type& pair) + { + result.emplace(pair.second, pair.first); + result.emplace(static_cast<char32_t>(static_cast<uint8_t>(pair.first)), 0); // map invalid characters to a known non-mapped value as marker + }); + return result; + } + + static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + + template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>> + struct iso_iterator { + typedef iso_t value_type; + typedef char32_t internal_type; + typedef char32_t& reference; + typedef char32_t* pointer; + typedef size_t difference_type; + typedef std::input_iterator_tag iterator_category; + typedef typename Container::const_iterator iterator; + typedef Container string_type; + + iso_iterator(const iterator& it): m_it(it) {} + + // pre-increment + iso_iterator& operator++() + { + ++m_it; + return *this; + } + + bool operator!=(const iso_iterator& other) const + { + return m_it != other.m_it; + } + + // return reference? + internal_type operator*() const + { + value_type value{*m_it}; + + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed + { + auto it{Map.find(value)}; + if (it != Map.end()) + return it->second; + } + return static_cast<internal_type>(static_cast<uint8_t>(value)); + } + + iso_iterator& operator+=(size_t distance) + { + std::advance(m_it, distance); + return *this; + } + + difference_type operator-(const iso_iterator& other) const + { + return m_it - other.m_it; + } + + private: + iterator m_it; + }; + + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>> + struct iso_back_insert_iterator { + typedef iso_back_insert_iterator& reference; + typedef iso_back_insert_iterator* pointer; + typedef size_t difference_type; + typedef iso_t value_type; + typedef char32_t internal_type; + typedef std::output_iterator_tag iterator_category; + typedef Container string_type; + + iso_back_insert_iterator(string_type& s): s(s) {} + + iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other) + { + if (std::addressof(other.s) != std::addressof(s)) + throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); + + return *this; + } + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + reference operator=(const internal_type& value) + { + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed + { + auto it{Map.find(value)}; + if (it != Map.end()) { + if (it->second == 0) // marker for non-mappable character found + throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value))); + s.push_back(it->second); + return *this; + } + } + + if (value > 255) + throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value))); + + s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value)); + return *this; + } + + private: + typename iso_back_insert_iterator::string_type& s; + }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + // Encoding for convert() and ISO-8859-* + template<typename InputIt, typename OutputIt> + struct ISO_8859 + { + typedef iso_t value_type; + typedef typename InputIt::string_type string_type; + + static InputIt begin(const typename InputIt::string_type& s) + { + return InputIt(s.cbegin()); + } + + static InputIt end(const typename InputIt::string_type& s) + { + return InputIt(s.cend()); + } + + static OutputIt back_inserter(typename OutputIt::string_type& s) + { + return OutputIt(s); + } + }; + + // Encoding for convert() + typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; + +} // namespace unicode + diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h new file mode 100644 index 0000000..5f8c6a4 --- /dev/null +++ b/include/unicode/predicate.h @@ -0,0 +1,21 @@ +#pragma once + +namespace unicode { + + // bits_to_compare: limit bits to consider even further than defined by T + // T: usually, char32_t, uint32_t etc. + template<size_t bits_to_compare = 32, typename T> + static inline bool is_valid_unicode(const T& value) noexcept + { + if constexpr(sizeof(T) == 1 || bits_to_compare <= 15) + return true; + else if constexpr(sizeof(T) == 2 || bits_to_compare <= 20) + //return value <= 0xD7FF || value >= 0xE000; + return (value & 0xF800) != 0xD800; + else + //return (value & 0xFFFFF800) != 0x0000D800 && (value >> 16) <= 0x10; + return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF); + } + +} // namespace unicode + diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h new file mode 100644 index 0000000..3ee1d82 --- /dev/null +++ b/include/unicode/type_traits.h @@ -0,0 +1,77 @@ +#pragma once + +#include "utf.h" + +#include <string> +#include <type_traits> + +namespace unicode { + + using namespace detail; + + // helper traits + + template<typename T> + struct is_encoding + { + static const bool value{std::is_empty_v<T>}; + }; + + template<typename T> + inline constexpr bool is_encoding_v {is_encoding<T>::value}; + + template<typename T> + struct is_container + { + static const bool value{!std::is_empty_v<T>}; + }; + + template<typename T> + inline constexpr bool is_container_v {is_container<T>::value}; + + template<typename T> + struct is_char + { + static const bool value{std::is_trivial_v<T> && std::is_scalar_v<T> && !std::is_empty_v<T>}; + }; + + template<typename T> + inline constexpr bool is_char_v {is_char<T>::value}; + + template<typename T> + struct is_utf_encoding + { + static const bool value{std::is_same_v<T, UTF<utf_iterator<typename T::value_type>, utf_back_insert_iterator<typename T::value_type>>>}; + }; + + template<typename T> + inline constexpr bool is_utf_encoding_v {is_utf_encoding<T>::value}; + + template<typename T> + struct is_utf_8 + { + static const bool value{std::is_trivial_v<T> && sizeof(T) == 1}; + }; + + template<typename T> + inline constexpr bool is_utf_8_v {is_utf_8<T>::value}; + + template<typename T> + struct is_utf_16 + { + static const bool value{std::is_trivial_v<T> && sizeof(T) == 2}; + }; + + template<typename T> + inline constexpr bool is_utf_16_v {is_utf_16<T>::value}; + + template<typename T> + struct is_utf_32 + { + static const bool value{std::is_trivial_v<T> && sizeof(T) == 4}; + }; + + template<typename T> + inline constexpr bool is_utf_32_v {is_utf_32<T>::value}; + +} // namespace unicode diff --git a/include/unicode/types.h b/include/unicode/types.h new file mode 100644 index 0000000..a4461d7 --- /dev/null +++ b/include/unicode/types.h @@ -0,0 +1,10 @@ +#pragma once + +#ifdef __cpp_char8_t +// char8_t available + typedef char8_t utf8_t; +#else + typedef char utf8_t; +#endif +typedef char iso_t; + diff --git a/include/unicode/utf.h b/include/unicode/utf.h new file mode 100644 index 0000000..dd504a7 --- /dev/null +++ b/include/unicode/utf.h @@ -0,0 +1,448 @@ +#pragma once + +#include <list> +#include <string> +#include <stdexcept> + +namespace unicode::detail { + + using namespace std::string_literals; + + template<size_t sequence_length, typename value_type> + inline bool is_utf8_leading_byte(value_type byte) noexcept + { + static_assert(sequence_length <= 4); + + if constexpr(sequence_length == 1) { + return !(byte & 0x80); + } else { + return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length)); + } + } + + template<typename value_type> + inline bool is_utf8_followup_byte(value_type b) noexcept + { + return (b & 0b11000000) == 0b10000000; + } + + template<typename value_type, typename... Tbytes> + inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept + { + constexpr auto sequence_length{sizeof...(Tbytes) + 1}; + + static_assert(sequence_length <= 4, "UTF-8 sequences of 1 through 4 code units are supported"); + + return is_utf8_leading_byte<sequence_length>(byte0) && + (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { + int i{}; + auto size{s.size()}; + while (i < size) { + if (is_utf8_sequence(s[i])) { + i++; + } else if ((i < size - 1) && is_utf8_sequence(s[i], s[i + 1])) { + i += 2; + } else if ((i < size - 2) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) { + if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20)) + return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF + i += 3; + } else if ((i < size - 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) { + if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11) + return false; // Unicode too big above 0x10FFFF + i += 4; + } else { + return false; + } + } + return true; + } + + template<typename value_type, typename... Twords> + inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept + { + constexpr auto sequence_length{sizeof...(Twords) + 1}; + + static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); + + if constexpr(sequence_length == 1) { + return is_valid_unicode(word0); + } else { + char16_t unit0 {static_cast<char16_t>(word0)}; + char16_t unit1 {static_cast<char16_t>((words, ...))}; + return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00; + } + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { + int i{}; + auto size{s.size()}; + while (i < size) { + if (is_utf16_sequence(s[i])) { + i++; + } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) { + i += 2; + } else { + return false; + } + } + return true; + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { + for (auto i: s) + if (!is_valid_unicode(i)) + return false; + return true; + } + + template<size_t sequence_length, typename value_type> + inline char32_t decode_utf8_leading_byte(value_type b) noexcept + { + return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6); + } + + template<typename value_type> + inline char32_t decode_utf8_followup_byte(value_type b) noexcept + { + return static_cast<char32_t>(b & 0b00111111); + } + + template<typename value_type, typename... Targs> + inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept + { + return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...); + } + + template<typename value_type, typename... Targs> + inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept + { + size_t constexpr sequence_length{sizeof...(Targs) + 1}; + + static_assert(sequence_length <= 4); + + if constexpr (sequence_length == 1) + return b; + else + return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...); + } + + template<typename T, typename Container=std::basic_string<T>> + struct utf_iterator + { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + + typedef T value_type; + typedef char32_t internal_type; + typedef char32_t& reference; + typedef char32_t* pointer; + typedef size_t difference_type; + typedef std::input_iterator_tag iterator_category; + typedef Container string_type; + + utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): + iterator(cbegin), end_iterator(cend) + { + } + + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; + + inline size_t remaining_code_units() const noexcept + { + return std::distance(iterator, end_iterator); + } + + template<size_t index> + inline value_type get_code_unit() const noexcept + { + if constexpr (std::is_same_v<Container, typename std::list<value_type>>) { + // std::list doesn't support it + n + auto it{iterator}; + std::advance(it, index); + return *it; + } else { + return *(iterator + index); + } + } + + template<typename... Tbytes> + inline internal_type calculate_utf8_value(Tbytes... bytes) + { + size_t constexpr sequence_length{sizeof...(Tbytes)}; + static_assert(sequence_length >= 1 && sequence_length <= 4); + + if constexpr(sequence_length > 1) { + if (remaining_code_units() < sequence_length) + throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence"); + } + + if (is_utf8_sequence(bytes...)) { + std::advance(iterator, sequence_length); + internal_type result{decode_utf8_sequence(bytes...)}; + if (!unicode::is_valid_unicode<sequence_length * 6>(result)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); + return result; + } else { + if constexpr(sequence_length <= 3) // template recursion break condition: UTF-8 has 1..4 code units + return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>())); + else + throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence"); + } + } + + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> + inline internal_type calculate_value() + { + return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>())); + } + + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> + inline internal_type calculate_value() + { + char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; + + if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane) + std::advance(iterator, 1); + return unit0; + } else { + if (remaining_code_units() < 2) + throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); + + char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())}; + if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) + throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); + + std::advance(iterator, 2); + return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; + } + } + + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> + inline internal_type calculate_value() + { + internal_type result {static_cast<internal_type>(get_code_unit<0>())}; + + if (!unicode::is_valid_unicode(result)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); + + std::advance(iterator, 1); + + return result; + } + + // pre-increment + utf_iterator& operator++() + { + return *this; + } + + bool operator!=(const utf_iterator& other) const + { + return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); + } + + internal_type operator*() + { + return calculate_value(); + } + + utf_iterator& operator+=(size_t distance) + { + std::advance(iterator, distance); + return *this; + } + + size_t operator-(const utf_iterator& other) const + { + return iterator - other.iterator; + } + + private: + typename string_type::const_iterator iterator; + typename string_type::const_iterator end_iterator; + }; + + // n is number of UTF-8 bytes in sequence + template<size_t n, typename From, typename To> + inline To utf8_byte0_of(const From& value) + { + return (value >> 6 * (n - 1)) | (0xFF << (8 - n)); + } + + // n is index of 6-bit groups, counting from bit 0 + template<size_t n, typename From, typename To> + inline To utf8_trailing_byte(const From& value) + { + return ((value >> n * 6) & 0b111111) | 0b10000000; + } + + // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII) + // assume value to be valid Unicode value for given byte position + template<size_t n, size_t m, typename From, typename To> + inline To utf8_byte_n_of_m(const From& value) + { + if constexpr (n == 0) + return utf8_byte0_of<m, From, To>(value); + else + return utf8_trailing_byte<m - n - 1, From, To>(value); + } + + template<typename T, typename Container=std::basic_string<T>> + struct utf_back_insert_iterator + { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + + typedef T value_type; + typedef char32_t internal_type; + typedef Container string_type; + typedef utf_back_insert_iterator& reference; + typedef utf_back_insert_iterator* pointer; + typedef size_t difference_type; + typedef std::output_iterator_tag iterator_category; + + utf_back_insert_iterator(string_type& s): s(s) {} + + utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other) + { + if (std::addressof(other.s) != std::addressof(s)) + throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); + + return *this; + } + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + template<typename... Args> + inline void append(Args&&... args) + { + if constexpr (std::is_same_v<Container, typename std::basic_string<value_type>>) { + s.append({args...}); + } else { + (s.emplace_back(args), ...); + } + } + + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> + inline void append_utf(const internal_type& value) + { + using Y = internal_type; + if (value < 0x80) { // 1 byte + append(static_cast<value_type>(value)); + } else if (value < 0x800) { // 2 bytes + append(utf8_byte_n_of_m<0,2,Y,X>(value), utf8_byte_n_of_m<1,2,Y,X>(value)); + } else if (value < 0x10000) { // 3 bytes + append(utf8_byte_n_of_m<0,3,Y,X>(value), utf8_byte_n_of_m<1,3,Y,X>(value), utf8_byte_n_of_m<2,3,Y,X>(value)); + } else { // 4 bytes + // expect value to be already valid Unicode values (checked in input iterator) + append(utf8_byte_n_of_m<0,4,Y,X>(value), utf8_byte_n_of_m<1,4,Y,X>(value), utf8_byte_n_of_m<2,4,Y,X>(value), utf8_byte_n_of_m<3,4,Y,X>(value)); + } + } + + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> + inline void append_utf(const internal_type& value) + { + if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) + append(static_cast<value_type>(value)); + } else { + internal_type value_reduced{value - 0x10000}; + append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00)); + } + } + + template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> + inline void append_utf(const internal_type& value) + { + // expect value to be already valid Unicode values (checked in input iterator) + append(static_cast<value_type>(value)); + } + + reference operator=(const internal_type& value) + { + append_utf(value); + return *this; + } + + private: + typename utf_back_insert_iterator::string_type& s; + }; + +} // namespace unicode::detail + +namespace unicode { + + // Encoding for convert() and UTF-* + template<typename InputIt, typename OutputIt> + struct UTF + { + typedef typename OutputIt::value_type value_type; + typedef typename InputIt::string_type string_type; + + static InputIt begin(const typename InputIt::string_type& s) + { + return InputIt{s.cbegin(), s.cend()}; + } + + static InputIt end(const typename InputIt::string_type& s) + { + return InputIt{s.cend(), s.cend()}; + } + + static OutputIt back_inserter(typename OutputIt::string_type& s) + { + return OutputIt(s); + } + }; + + // Encoding for convert() + typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; + typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; + typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; + + // Helper to get correct Encoding from char type, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type> + template<typename T> + struct Encoding + { + }; + + template<> + struct Encoding<utf8_t> + { + typedef UTF_8 type; + }; + + template<> + struct Encoding<char16_t> + { + typedef UTF_16 type; + }; + + template<> + struct Encoding<char32_t> + { + typedef UTF_32 type; + }; + + template<typename T> + using Encoding_t = typename Encoding<T>::type; + +} // namespace unicode + |