// libunicode
//
// Author: Roland Reichwein
//
// Available under the conditions of CC0 1.0 Universal
// https://creativecommons.org/publicdomain/zero/1.0/
//
// Header-only conversion and validation between UTF-8, UTF-16, UTF-32,
// ISO-8859-1 and ISO-8859-15.

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <list>
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>

#ifdef __cpp_char8_t
// char8_t available
typedef char8_t utf8_t;
#else
typedef char utf8_t;
#endif
typedef char iso_t;

namespace unicode {

  // Returns true if value is acceptable as a code unit / code point for a
  // type of its width:
  //  - 1 byte:  always true (sequence validity is checked elsewhere)
  //  - 2 bytes: true unless value is a surrogate (0xD800..0xDFFF)
  //  - other:   true for 0..0xD7FF and 0xE000..0x10FFFF
  // T is usually char32_t, uint32_t etc.
  template<typename T>
  static inline bool is_valid_unicode(const T& value) noexcept
  {
    if constexpr(sizeof(T) == 1)
      return true;
    else if constexpr(sizeof(T) == 2)
      //return value <= 0xD7FF || value >= 0xE000;
      return (value & 0xF800) != 0xD800;
    else
      //return (value & 0xFFFFF800) != 0x0000D800 && (value >> 16) <= 0x10;
      return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
  }

}

namespace unicode::detail {

  using namespace std::string_literals;

  // True if byte has the bit pattern of the first byte of a UTF-8 sequence
  // of sequence_length bytes (0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx).
  template<size_t sequence_length, typename value_type>
  inline bool is_utf8_leading_byte(value_type byte) noexcept
  {
    static_assert(sequence_length <= 4);

    if constexpr(sequence_length == 1) {
      return !(byte & 0x80);
    } else {
      // Both sides are narrowed to value_type so that the comparison also
      // works when value_type is a signed char.
      return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) ==
             static_cast<value_type>(0xFF << (8 - sequence_length));
    }
  }

  // True if b has the bit pattern 10xxxxxx of a UTF-8 continuation byte.
  template<typename value_type>
  inline bool is_utf8_followup_byte(value_type b) noexcept
  {
    return (b & 0b11000000) == 0b10000000;
  }

  // True if the given 1..4 bytes are structurally a UTF-8 sequence of
  // exactly that length: matching leading byte followed by continuation bytes.
  template<typename value_type, typename... Tbytes>
  inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
  {
    constexpr auto n{sizeof...(Tbytes) + 1};

    static_assert(n <= 4, "UTF-8 sequences of 1 through 4 code units are supported");

    return is_utf8_leading_byte<n>(byte0) &&
           (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right
  }

  // Validates a string of 1-byte code units as UTF-8.
  // Rejects malformed and truncated sequences, encoded UTF-16 surrogates and
  // values above 0x10FFFF.
  // NOTE(review): overlong encodings (e.g. C0 80) are not rejected here;
  // confirm whether callers rely on that.
  template<typename T, std::enable_if_t<(sizeof(T) == 1), bool> = true>
  inline bool validate_utf(const std::basic_string<T>& s)
  {
    const size_t size{s.size()};
    size_t i{};
    while (i < size) {
      // Number of code units left from i on. Comparing against this instead
      // of "size - k" avoids unsigned underflow (and thus out-of-bounds
      // reads) on sequences truncated at the end of the string.
      const size_t remaining{size - i};

      if (is_utf8_sequence(s[i])) {
        i++;
      } else if (remaining >= 2 && is_utf8_sequence(s[i], s[i + 1])) {
        i += 2;
      } else if (remaining >= 3 && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) {
        if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20))
          return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF
        i += 3;
      } else if (remaining >= 4 && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) {
        if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11)
          return false; // Unicode too big above 0x10FFFF
        i += 4;
      } else {
        return false;
      }
    }
    return true;
  }

  // With one argument: true if word0 is a valid single UTF-16 unit (i.e. not
  // a surrogate). With two arguments: true if they form a high/low surrogate
  // pair.
  template<typename value_type, typename... Twords>
  inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
  {
    constexpr auto n{sizeof...(Twords) + 1};

    static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");

    if constexpr(n == 1) {
      return is_valid_unicode(word0);
    } else {
      char16_t unit0 {static_cast<char16_t>(word0)};
      char16_t unit1 {static_cast<char16_t>((words, ...))};
      return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00;
    }
  }

  // Validates a string of 2-byte code units as UTF-16.
  template<typename T, std::enable_if_t<(sizeof(T) == 2), bool> = true>
  inline bool validate_utf(const std::basic_string<T>& s)
  {
    const size_t size{s.size()};
    size_t i{};
    while (i < size) {
      if (is_utf16_sequence(s[i])) {
        i++;
      } else if (size - i >= 2 && is_utf16_sequence(s[i], s[i + 1])) {
        i += 2;
      } else {
        return false;
      }
    }
    return true;
  }

  // Validates a string of 4-byte code units as UTF-32.
  template<typename T, std::enable_if_t<(sizeof(T) == 4), bool> = true>
  inline bool validate_utf(const std::basic_string<T>& s)
  {
    for (auto i: s)
      if (!is_valid_unicode(i))
        return false;
    return true;
  }

  // Payload (lower 6 bits) of a single UTF-8 continuation byte.
  template<typename value_type>
  inline char32_t continuation_value(value_type b) noexcept
  {
    return static_cast<char32_t>(b & 0b00111111);
  }

  // Combined payload of several UTF-8 continuation bytes, first byte being
  // the most significant 6-bit group.
  template<typename value_type, typename... Targs>
  inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept
  {
    return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
  }

  // Payload of the leading byte of an n byte UTF-8 sequence, shifted to its
  // position in the resulting code point.
  template<size_t n, typename value_type>
  inline char32_t value_byte0_of(value_type b) noexcept
  {
    return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
  }

  // Input iterator decoding UTF-8/16/32 code units (selected by sizeof(T))
  // from a Container into char32_t code points.
  //
  // Unusual protocol: operator*() decodes one code point AND advances the
  // underlying position past it; operator++() is a no-op. Dereference
  // exactly once per loop step (as std::copy / std::for_each do).
  // Decoding throws std::invalid_argument on malformed input.
  template<typename T, typename Container=std::basic_string<T>>
  struct utf_iterator
  {
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);

    typedef T value_type;
    typedef char32_t internal_type;
    typedef char32_t& reference;
    typedef char32_t* pointer;
    typedef size_t difference_type;
    typedef std::input_iterator_tag iterator_category;
    typedef Container string_type;

    // cbegin: current position, cend: end of the underlying container
    utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
      iterator(cbegin), end_iterator(cend)
    {
    }

    utf_iterator(const utf_iterator& other) = default;
    utf_iterator& operator=(const utf_iterator& other) = default;

    // Number of code units between the current position and the end.
    size_t remaining_code_units() const noexcept
    {
      return std::distance(iterator, end_iterator);
    }

    // Code unit at offset index from the current position (no bounds check).
    template<size_t index>
    value_type get_code_unit() const noexcept
    {
      if constexpr (std::is_same<Container, typename std::list<value_type>>::value) {
        // std::list doesn't support it + n
        auto it{iterator};
        std::advance(it, index);
        return *it;
      } else {
        return *(iterator + index);
      }
    }

    // UTF-8: decode one code point and advance past it.
    template<class X = value_type, std::enable_if_t<(sizeof(X) == 1), bool> = true>
    inline internal_type calculate_value()
    {
      utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
      if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII
        std::advance(iterator, 1);
        return byte0;
      } else {
        internal_type value{};
        if (size_t remaining{remaining_code_units()}; remaining >= 2) {
          utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
          if (is_utf8_sequence(byte0, byte1)) { // 2 bytes
            value = value_byte0_of<2>(byte0) | continuation_value(byte1);
            std::advance(iterator, 2);
          } else if (remaining >= 3) {
            utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
            if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes
              value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
              std::advance(iterator, 3);
            } else if (remaining >= 4) {
              utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
              if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes
                value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
                std::advance(iterator, 4);
              } else
                throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
            } else
              throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
          } else
            throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
        } else
          throw std::invalid_argument("Bad input: 2nd byte expected, none found");

        // check only for sequences >= 2 bytes (ASCII is always compliant)
        if (!unicode::is_valid_unicode(value))
          throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));

        return value;
      }
    }

    // UTF-16: decode one code point and advance past it.
    template<class X = value_type, std::enable_if_t<(sizeof(X) == 2), bool> = true>
    inline internal_type calculate_value()
    {
      char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};

      if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane)
        std::advance(iterator, 1);
        return unit0;
      } else {
        if (remaining_code_units() < 2)
          throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");

        char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
        if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
          throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");

        std::advance(iterator, 2);
        return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
      }
    }

    // UTF-32: check one code point and advance past it.
    template<class X = value_type, std::enable_if_t<(sizeof(X) == 4), bool> = true>
    inline internal_type calculate_value()
    {
      internal_type result {static_cast<internal_type>(get_code_unit<0>())};

      if (!unicode::is_valid_unicode(result))
        throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));

      std::advance(iterator, 1);

      return result;
    }

    // pre-increment: no-op, advancing is done by operator*()
    utf_iterator& operator++()
    {
      return *this;
    }

    // Iterators compare by their remaining distance to their respective end.
    bool operator!=(const utf_iterator& other) const
    {
      return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
    }

    // Decodes the current code point and moves on to the next one.
    internal_type operator*()
    {
      return calculate_value();
    }

    // Skips distance code units (not code points).
    utf_iterator& operator+=(size_t distance)
    {
      std::advance(iterator, distance);
      return *this;
    }

    // Distance in code units.
    size_t operator-(const utf_iterator& other) const
    {
      return iterator - other.iterator;
    }

  private:
    typename string_type::const_iterator iterator;
    typename string_type::const_iterator end_iterator;
  };

  // Output iterator encoding char32_t code points as UTF-8/16/32 (selected
  // by sizeof(T)) and appending the code units to a Container.
  template<typename T, typename Container=std::basic_string<T>>
  struct utf_back_insert_iterator
  {
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);

    typedef T value_type;
    typedef char32_t internal_type;
    typedef Container string_type;
    typedef utf_back_insert_iterator& reference;
    typedef utf_back_insert_iterator* pointer;
    typedef size_t difference_type;
    typedef std::output_iterator_tag iterator_category;

    utf_back_insert_iterator(string_type& s): s(s) {}

    // The referenced container cannot be rebound, so assignment is only
    // accepted (as a no-op) between iterators on the same container.
    utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other)
    {
      if (std::addressof(other.s) != std::addressof(s))
        throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
      return *this;
    }

    // no-op
    reference operator++()
    {
      return *this;
    }

    // support *x = value, together with operator=()
    reference operator*()
    {
      return *this;
    }

    // n is number of UTF-8 bytes in sequence
    template<size_t n>
    inline static value_type byte0_of(internal_type value)
    {
      return (value >> 6 * (n - 1)) | (0xFF << (8 - n));
    }

    // n is index of 6-bit groups, counting from bit 0
    template<size_t n>
    inline static value_type trailing_byte(internal_type value)
    {
      return ((value >> n * 6) & 0b111111) | 0b10000000;
    }

    // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
    // assume value to be valid Unicode value for given byte position
    template<size_t n, size_t m>
    inline static value_type byte_n_of_m(internal_type value)
    {
      if constexpr (n == 0)
        return byte0_of<m>(value);
      else
        return trailing_byte<m - n - 1>(value);
    }

    // Appends a single code unit.
    template<typename Arg>
    inline void append(Arg&& arg)
    {
      if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {
        s.append({arg});
      } else {
        s.emplace_back(arg);
      }
    }

    // Appends several code units in order.
    template<typename Arg, typename... Args>
    inline void append(Arg&& arg, Args&&... args)
    {
      if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {
        s.append({arg, args...});
      } else {
        s.emplace_back(arg);
        append(args...);
      }
    }

    // UTF-8: throws std::runtime_error for values above 0x10FFFF.
    template<class X = value_type, std::enable_if_t<(sizeof(X) == 1), bool> = true>
    inline void append_utf(const internal_type& value)
    {
      if (value < 0x80) { // 1 byte
        append(static_cast<value_type>(value));
      } else if (value < 0x800) { // 2 bytes
        append(byte_n_of_m<0,2>(value), byte_n_of_m<1,2>(value));
      } else if (value < 0x10000) { // 3 bytes
        append(byte_n_of_m<0,3>(value), byte_n_of_m<1,3>(value), byte_n_of_m<2,3>(value));
      } else if (value < 0x110000) { // 4 bytes
        append(byte_n_of_m<0,4>(value), byte_n_of_m<1,4>(value), byte_n_of_m<2,4>(value), byte_n_of_m<3,4>(value));
      } else
        throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
    }

    // UTF-16
    template<class X = value_type, std::enable_if_t<(sizeof(X) == 2), bool> = true>
    inline void append_utf(const internal_type& value)
    {
      if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
        append(static_cast<value_type>(value));
      } else {
        internal_type value_reduced{value - 0x10000};
        append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00));
      }
    }

    // UTF-32
    template<class X = value_type, std::enable_if_t<(sizeof(X) == 4), bool> = true>
    inline void append_utf(const internal_type& value)
    {
      // expect value to be already valid Unicode values (checked in input iterator)
      append(static_cast<value_type>(value));
    }

    // Encodes value and appends it to the container.
    reference operator=(const internal_type& value)
    {
      append_utf(value);
      return *this;
    }

  private:
    typename utf_back_insert_iterator::string_type& s;
  };

  // ISO-8859 code unit -> Unicode code point, for the positions where an
  // ISO-8859 variant differs from the lower 8 bit of Unicode, and the reverse.
  typedef std::unordered_map<iso_t, char32_t> iso_map_type;
  typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse;

  // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary
  static inline iso_map_type iso_8859_1_map;

  // ISO-8859-15 is lower 8-bit of Unicode, except for:
  static inline iso_map_type iso_8859_15_map {
    { '\xA4', U'\u20AC' }, // €
    { '\xA6', U'\u0160' }, // Š
    { '\xA8', U'\u0161' }, // š
    { '\xB4', U'\u017D' }, // Ž
    { '\xB8', U'\u017E' }, // ž
    { '\xBC', U'\u0152' }, // Œ
    { '\xBD', U'\u0153' }, // œ
    { '\xBE', U'\u0178' }, // Ÿ
  };

  // Returns map with keys and values swapped.
  inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) {
    iso_map_type_reverse result;
    std::for_each(map.cbegin(), map.cend(),
                  [&](const iso_map_type::value_type& pair)
                  {
                    result.emplace(pair.second, pair.first);
                  });
    return result;
  }

  static inline iso_map_type_reverse iso_8859_15_map_reverse = reverse_iso_map(iso_8859_15_map);
  static inline iso_map_type_reverse iso_8859_1_map_reverse = reverse_iso_map(iso_8859_1_map);

} // namespace unicode::detail

namespace unicode {

  using namespace detail;

  // Input iterator decoding ISO-8859 code units from a Container into
  // char32_t code points. Map holds the positions that differ from the lower
  // 8 bit of Unicode.
  template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>>
  struct iso_iterator {
    typedef iso_t value_type;
    typedef char32_t internal_type;
    typedef char32_t& reference;
    typedef char32_t* pointer;
    typedef size_t difference_type;
    typedef std::input_iterator_tag iterator_category;
    typedef typename Container::const_iterator iterator;
    typedef Container string_type;

    iso_iterator(const iterator& it): m_it(it) {}

    // pre-increment
    iso_iterator& operator++()
    {
      ++m_it;
      return *this;
    }

    bool operator!=(const iso_iterator& other) const
    {
      return m_it != other.m_it;
    }

    // Decodes the current code unit; does not advance.
    internal_type operator*() const
    {
      value_type value{*m_it};

      if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed
      {
        auto it{Map.find(value)};
        if (it != Map.end())
          return it->second;
      }
      // via uint8_t to prevent sign extension of values >= 0x80
      return static_cast<internal_type>(static_cast<uint8_t>(value));
    }

    iso_iterator& operator+=(size_t distance)
    {
      std::advance(m_it, distance);
      return *this;
    }

    difference_type operator-(const iso_iterator& other) const
    {
      return m_it - other.m_it;
    }

  private:
    iterator m_it;
  };

  // Output iterator encoding char32_t code points as ISO-8859 code units and
  // appending them to a Container. Map holds the code points that differ
  // from the lower 8 bit of Unicode.
  // NOTE(review): code points below 256 that are not in Map are emitted
  // unchanged, even if the target encoding uses that position for another
  // character (e.g. U+00A4 becomes 0xA4, which is the Euro sign in
  // ISO-8859-15) - confirm whether this is intended.
  template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>>
  struct iso_back_insert_iterator {
    typedef iso_back_insert_iterator& reference;
    typedef iso_back_insert_iterator* pointer;
    typedef size_t difference_type;
    typedef iso_t value_type;
    typedef char32_t internal_type;
    typedef std::output_iterator_tag iterator_category;
    typedef Container string_type;

    iso_back_insert_iterator(string_type& s): s(s) {}

    // The referenced container cannot be rebound, so assignment is only
    // accepted (as a no-op) between iterators on the same container.
    iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other)
    {
      if (std::addressof(other.s) != std::addressof(s))
        throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
      return *this;
    }

    // no-op
    reference operator++()
    {
      return *this;
    }

    // support *x = value, together with operator=()
    reference operator*()
    {
      return *this;
    }

    // Encodes value and appends it to the container.
    // Throws std::invalid_argument if value is not in Map and above 255.
    reference operator=(const internal_type& value)
    {
      if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed
      {
        auto it{Map.find(value)};
        if (it != Map.end()) {
          s.push_back(it->second);
          return *this;
        }
      }

      if (value > 255)
        throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value)));

      s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value));
      return *this;
    }

  private:
    typename iso_back_insert_iterator::string_type& s;
  };

  // Encoding for convert() and ISO-8859-*
  template<typename InputIt, typename OutputIt>
  struct ISO_8859
  {
    typedef iso_t value_type;
    typedef typename InputIt::string_type string_type;

    static InputIt begin(const typename InputIt::string_type& s)
    {
      return InputIt(s.cbegin());
    }

    static InputIt end(const typename InputIt::string_type& s)
    {
      return InputIt(s.cend());
    }

    static OutputIt back_inserter(typename OutputIt::string_type& s)
    {
      return OutputIt(s);
    }
  };

  // Encoding for convert() and UTF-*
  template<typename InputIt, typename OutputIt>
  struct UTF
  {
    typedef typename OutputIt::value_type value_type;
    typedef typename InputIt::string_type string_type;

    static InputIt begin(const typename InputIt::string_type& s)
    {
      return InputIt{s.cbegin(), s.cend()};
    }

    static InputIt end(const typename InputIt::string_type& s)
    {
      return InputIt{s.cend(), s.cend()};
    }

    static OutputIt back_inserter(typename OutputIt::string_type& s)
    {
      return OutputIt(s);
    }
  };

  // Encoding for convert()
  typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1;
  typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15;

  typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8;
  typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
  typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;

  // Distance between two input iterators in code units.
  // std::distance doesn't work here: it is based on "output" distance of iterators
  template<class Iterator>
  inline size_t input_distance(const Iterator& it1, const Iterator& it2)
  {
    return it2 - it1;
  }

  // Distance between two input iterators in bytes.
  template<class Iterator>
  inline size_t input_distance_bytes(const Iterator& it1, const Iterator& it2)
  {
    return input_distance(it1, it2) * sizeof(typename Iterator::value_type);
  }

  // Optimizations following:
  // Input is inspected one "accu" (machine word) at a time; words consisting
  // of ASCII code units only are copied without decoding.
  static const size_t accu_size {sizeof(size_t)};

  // ascii_mask: bits that are all zero in 4 bytes of ASCII-only code units of
  // the respective size.
  template<int value_size>
  struct ConvertInputOptimizer {};

  template<> struct ConvertInputOptimizer<1>
  {
    static const uint32_t ascii_mask { 0x80808080 };
  };

  template<> struct ConvertInputOptimizer<2>
  {
    static const uint32_t ascii_mask { 0xFF80FF80 };
  };

  template<> struct ConvertInputOptimizer<4>
  {
    static const uint32_t ascii_mask { 0xFFFFFF80 };
  };

  template<int AccuSize, class InputOptimizer>
  struct ArchitectureOptimizer {};

  // 4 byte accu
  template<class InputOptimizer>
  struct ArchitectureOptimizer<4, InputOptimizer>
  {
    typedef InputOptimizer input_optimizer;
    typedef uint32_t accu_type;
    static const accu_type addr_mask {accu_size - 1};
    static const accu_type ascii_mask { static_cast<accu_type>(input_optimizer::ascii_mask) };
    static const accu_type ascii_value { 0ULL };

    // Appends the 4 bytes of ASCII code units at addr to s, widening or
    // narrowing each code unit to the value type of s.
    template<typename input_value_type, class output_string_type>
    inline static void append(const input_value_type* addr, output_string_type& s)
    {
      typedef typename output_string_type::value_type output_value_type;

      if constexpr(sizeof(input_value_type) == sizeof(output_value_type)) {
        s.append(reinterpret_cast<const output_value_type*>(addr), accu_size / sizeof(input_value_type));
      } else if constexpr(sizeof(input_value_type) == 1) {
        s.append({static_cast<output_value_type>(addr[0]),
                  static_cast<output_value_type>(addr[1]),
                  static_cast<output_value_type>(addr[2]),
                  static_cast<output_value_type>(addr[3])});
      } else if constexpr(sizeof(input_value_type) == 2) {
        s.append({static_cast<output_value_type>(addr[0]),
                  static_cast<output_value_type>(addr[1])});
      } else if constexpr(sizeof(input_value_type) == 4) {
        s.append({static_cast<output_value_type>(addr[0])});
      }
    }
  };

  // 8 byte accu
  template<class InputOptimizer>
  struct ArchitectureOptimizer<8, InputOptimizer>
  {
    typedef InputOptimizer input_optimizer;
    typedef uint64_t accu_type;
    static const accu_type addr_mask {accu_size - 1};
    static const accu_type ascii_mask { (static_cast<accu_type>(input_optimizer::ascii_mask)) << 32 | static_cast<accu_type>(input_optimizer::ascii_mask) };
    static const accu_type ascii_value { 0ULL };

    // Appends the 8 bytes of ASCII code units at addr to s, widening or
    // narrowing each code unit to the value type of s.
    template<typename input_value_type, class output_string_type>
    inline static void append(const input_value_type* addr, output_string_type& s)
    {
      typedef typename output_string_type::value_type output_value_type;

      if constexpr(sizeof(input_value_type) == sizeof(output_value_type)) {
        s.append(reinterpret_cast<const output_value_type*>(addr), accu_size / sizeof(input_value_type));
      } else if constexpr(sizeof(input_value_type) == 1) {
        s.append({static_cast<output_value_type>(addr[0]),
                  static_cast<output_value_type>(addr[1]),
                  static_cast<output_value_type>(addr[2]),
                  static_cast<output_value_type>(addr[3]),
                  static_cast<output_value_type>(addr[4]),
                  static_cast<output_value_type>(addr[5]),
                  static_cast<output_value_type>(addr[6]),
                  static_cast<output_value_type>(addr[7])});
      } else if constexpr(sizeof(input_value_type) == 2) {
        s.append({static_cast<output_value_type>(addr[0]),
                  static_cast<output_value_type>(addr[1]),
                  static_cast<output_value_type>(addr[2]),
                  static_cast<output_value_type>(addr[3])});
      } else if constexpr(sizeof(input_value_type) == 4) {
        s.append({static_cast<output_value_type>(addr[0]),
                  static_cast<output_value_type>(addr[1])});
      }
    }
  }; // class ArchitectureOptimizer

  // Converts s from encoding From to encoding To, copying accu-aligned
  // ASCII-only words without decoding them.
  // From and To are Encodings
  // Throws what the iterators of From and To throw on invalid input.
  template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
  typename To::string_type convert_optimized(const typename From::string_type& s)
  {
    typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer;
    typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer;
    constexpr size_t units_per_accu{accu_size / sizeof(typename From::value_type)};

    typename To::string_type result;

    auto begin{From::begin(s)};
    auto end{From::end(s)};
    auto back_inserter{To::back_inserter(result)};

    // address of the code unit "begin" currently points to
    const auto current = [&]() {
      return s.data() + (s.size() - input_distance(begin, end));
    };
    const auto is_aligned = [](const void* p) {
      return (reinterpret_cast<std::uintptr_t>(p) & arch_optimizer::addr_mask) == 0;
    };

    while (input_distance_bytes(begin, end) >= accu_size) {
      const auto* addr{current()};
      if (is_aligned(addr)) {
        while (input_distance_bytes(begin, end) >= accu_size) {
          // memcpy instead of dereferencing a casted pointer: no aliasing violation
          typename arch_optimizer::accu_type data{};
          std::memcpy(&data, addr, sizeof(data));
          if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
            arch_optimizer::append(addr, result);
            begin += units_per_accu;
            addr += units_per_accu;
          } else {
            // just advance one code unit for now and break to trigger unoptimized
            // version until next accu boundary
            back_inserter = *begin;
            ++begin;
            break;
          }
        }
      }

      // keep up after unaligned Non-ASCII code points
      while (begin != end && !is_aligned(current())) {
        back_inserter = *begin;
        ++begin;
      }
    }

    // remainder < accu_size bytes
    while (begin != end) {
      back_inserter = *begin;
      ++begin;
    }

    return result;
  }

  // Converts s from encoding From to encoding To, e.g.
  // convert<UTF_8, UTF_16>(s).
  // From and To are Encodings
  // Throws std::invalid_argument on invalid input.
  template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
  typename To::string_type convert(const typename From::string_type& s)
  {
    // if input type == output type, only validate and return input, is appropriate
    if constexpr(sizeof(typename From::value_type) == 1 &&
                 sizeof(typename To::value_type) == 1 &&
                 std::is_same_v<From, UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>>> &&
                 std::is_same_v<To, UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>>>) {
      if (validate_utf(s)) {
        return s;
      } else {
        throw std::invalid_argument("Invalid UTF-8");
      }
    }
    if constexpr(accu_size == 4 || accu_size == 8) {
      return convert_optimized<From, To>(s);
    } else {
      typename To::string_type result;
      std::copy(From::begin(s), From::end(s), To::back_inserter(result));
      return result;
    }
  }

  // Helper to get correct Encoding from char type, e.g. Encoding<char16_t>::type or Encoding_t<char16_t>
  template<typename T>
  struct Encoding
  {
  };

  template<>
  struct Encoding<utf8_t>
  {
    typedef UTF_8 type;
  };

  template<>
  struct Encoding<char16_t>
  {
    typedef UTF_16 type;
  };

  template<>
  struct Encoding<char32_t>
  {
    typedef UTF_32 type;
  };

  template<typename T>
  using Encoding_t = typename Encoding<T>::type;

  // Converts between UTF encodings selected by code unit type, e.g.
  // convert<char, char16_t>(s).
  // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t
  // Throws std::invalid_argument on invalid input.
  template<typename From, typename To,
           typename FromContainer=std::basic_string<From>,
           typename ToContainer=std::basic_string<To>,
           std::enable_if_t<std::is_scalar<From>::value && std::is_scalar<To>::value && !std::is_empty<From>::value, bool> = true>
  ToContainer convert(const FromContainer& s)
  {
    typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;

    ToContainer result;

    std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));

    return result;
  }

  // Converts between UTF encodings selected by container type, e.g.
  // convert<std::string, std::u16string>(s).
  // From and To are containers
  // Throws std::invalid_argument on invalid input.
  template<typename FromContainer, typename ToContainer,
           std::enable_if_t<!std::is_scalar<FromContainer>::value && !std::is_empty<FromContainer>::value, bool> = true
          >
  ToContainer convert(const FromContainer& s)
  {
    typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>,
                utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait;

    ToContainer result;

    std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));

    return result;
  }

  // Returns true if s decodes without error as the UTF encoding matching
  // the code unit size of Container.
  // Container version
  template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true>
  bool is_valid_utf(const Container& s)
  {
    typedef UTF<utf_iterator<typename Container::value_type, Container>,
                utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait;

    try {
      // decoding is done for its side effect only: it throws on invalid input
      std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t&){});
    } catch (const std::invalid_argument&) {
      return false;
    }
    return true;
  }

  // Returns true if s decodes without error as the UTF encoding matching
  // the size of code unit type T.
  // basic type version
  template<typename T,
           typename Container=std::basic_string<T>,
           std::enable_if_t<std::is_scalar<T>::value && !std::is_empty<T>::value, bool> = true>
  bool is_valid_utf(const Container& s)
  {
    typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait;

    try {
      // decoding is done for its side effect only: it throws on invalid input
      std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t&){});
    } catch (const std::invalid_argument&) {
      return false;
    }
    return true;
  }

  // Returns true if s is valid in the given UTF Encoding, e.g.
  // is_valid_utf<UTF_8>(s).
  // Encoding version
  template<typename Enc, std::enable_if_t<std::is_empty<Enc>::value, bool> = true>
  bool is_valid_utf(const typename Enc::string_type& s)
  {
    return validate_utf<typename Enc::value_type>(s);
  }

} // namespace unicode