// libunicode
//
// Header-only conversion between UTF-8, UTF-16 and UTF-32 strings.
// The encoding of a std::basic_string<T> is selected by the size of its
// code unit type T: 1 byte = UTF-8, 2 bytes = UTF-16, 4 bytes = UTF-32.

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>

#ifdef __cpp_char8_t
// char8_t available
typedef char8_t utf8_t;
#else
typedef char utf8_t;
#endif

namespace unicode {

// Returns true if value is a Unicode scalar value, i.e. at most 0x10FFFF
// and outside the UTF-16 surrogate range 0xD800..0xDFFF.
// T is usually char32_t, uint32_t etc.
template<typename T>
static inline bool is_valid_unicode(const T& value)
{
    return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000);
}

} // namespace unicode

namespace unicode::detail {

using namespace std::string_literals;

// Input iterator over the code points of a std::basic_string<T>.
//
// Dereferencing yields the code point (char32_t) decoded at the current
// position. Decoding happens in the constructor and in operator++, both of
// which throw std::invalid_argument on malformed input.
template<typename T>
struct utf_iterator
{
    // member types required by std::iterator_traits, so that standard
    // algorithms such as std::copy can be instantiated with this iterator
    typedef std::input_iterator_tag iterator_category;
    typedef std::ptrdiff_t difference_type;
    typedef char32_t value_type;
    typedef char32_t* pointer;
    typedef char32_t& reference;
    typedef std::basic_string<T> string_type;

    // cbegin: position to decode from; cend: end of the underlying string
    utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
        iterator(cbegin), end_iterator(cend)
    {
        calculate_value<T>();
    }

    utf_iterator(const utf_iterator& other) = default;
    utf_iterator& operator=(const utf_iterator& other) = default;

    // number of code units between the current position and the end
    size_t remaining_code_units() const
    {
        return static_cast<size_t>(end_iterator - iterator);
    }

    // code unit at offset index from the current position (no bounds check)
    template<size_t index>
    T get_code_unit() const
    {
        return *(iterator + index);
    }

    // Set the value and sequence_length members from the code units at the
    // current position. Does nothing at the end of the string.
    //
    // The encoding is chosen at compile time from sizeof(T1). This replaces
    // in-class explicit specializations, which GCC rejects
    // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85282).
    template<class T1>
    void calculate_value()
    {
        if constexpr (sizeof(T1) == 1) {
            calculate_value_utf8();
        } else if constexpr (sizeof(T1) == 2) {
            calculate_value_utf16();
        } else {
            static_assert(sizeof(T1) == 4);
            calculate_value_utf32();
        }
    }

    // true if b has the form 10xxxxxx
    inline static bool is_continuation_byte(T b)
    {
        return (b & 0b11000000) == 0b10000000;
    }

    // true if all given bytes are continuation bytes
    template<typename... Targs>
    inline static bool is_continuation_byte(T b, Targs... Fargs)
    {
        return is_continuation_byte(b) && is_continuation_byte(Fargs...);
    }

    // true if b is the lead byte of an n byte sequence:
    // n one-bits followed by a zero bit
    template<size_t n>
    inline static bool is_byte0_of(T b)
    {
        return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
    }

    // payload (lower 6 bits) of a continuation byte
    inline static char32_t continuation_value(T b)
    {
        return static_cast<char32_t>(b & 0b00111111);
    }

    // combined payload of several continuation bytes, first byte most significant
    template<typename... Targs>
    inline static char32_t continuation_value(T b, Targs... Fargs)
    {
        return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
    }

    // payload of the lead byte of an n byte sequence, shifted to its final position
    template<size_t n>
    inline static char32_t value_byte0_of(T b)
    {
        return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
    }

    // UTF-32: one code unit is one code point
    void calculate_value_utf32()
    {
        const size_t remaining{remaining_code_units()};

        if (!remaining)
            return;

        value = static_cast<char32_t>(get_code_unit<0>());

        if (!unicode::is_valid_unicode(value))
            throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<std::uint32_t>(value)));

        sequence_length = 1;
    }

    // UTF-8: 1 to 4 code units per code point
    void calculate_value_utf8()
    {
        const size_t remaining{remaining_code_units()};

        if (!remaining)
            return;

        const T byte0{get_code_unit<0>()};
        if (byte0 & 0x80) { // 2-4 bytes
            if (remaining >= 2) {
                const T byte1{get_code_unit<1>()};
                if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
                    value = value_byte0_of<2>(byte0) | continuation_value(byte1);
                    sequence_length = 2;
                } else if (remaining >= 3) {
                    const T byte2{get_code_unit<2>()};
                    if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
                        value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
                        sequence_length = 3;
                    } else if (remaining >= 4) {
                        const T byte3{get_code_unit<3>()};
                        if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
                            value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
                            sequence_length = 4;
                        } else
                            throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
                    } else
                        throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
                } else
                    throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
            } else
                throw std::invalid_argument("Bad input: 2nd byte expected, none found");

            // A code point must use the shortest possible sequence; longer
            // ("overlong") forms such as C0 80 for U+0000 are ill-formed.
            // Indexed by sequence length: smallest value needing that length.
            static constexpr char32_t smallest_value[]{0, 0, 0x80, 0x800, 0x10000};
            if (value < smallest_value[sequence_length])
                throw std::invalid_argument("Bad input: Overlong UTF-8 sequence");

            // check only for sequences >= 2 bytes (ASCII is always compliant)
            if (!unicode::is_valid_unicode(value))
                throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<std::uint32_t>(value)));
        } else { // 1 byte: 7 bit ASCII
            value = static_cast<char32_t>(byte0);
            sequence_length = 1;
        }
    }

    // UTF-16: 1 code unit, or a high/low surrogate pair
    void calculate_value_utf16()
    {
        const size_t remaining{remaining_code_units()};

        if (!remaining)
            return;

        const char16_t unit0{static_cast<char16_t>(get_code_unit<0>())};

        if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
            value = unit0;
            sequence_length = 1;
        } else {
            if (remaining < 2)
                throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");

            const char16_t unit1{static_cast<char16_t>(get_code_unit<1>())};
            // first unit must be a high surrogate, second a low surrogate
            if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
                throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");

            value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
            sequence_length = 2;
        }
    }

    // pre-increment: skip the current code point and decode the next one
    utf_iterator& operator++()
    {
        iterator += sequence_length;
        calculate_value<T>();
        return *this;
    }

    // iterators compare by position in the underlying string only
    bool operator==(const utf_iterator& other) const
    {
        return iterator == other.iterator;
    }

    bool operator!=(const utf_iterator& other) const
    {
        return iterator != other.iterator;
    }

    reference operator*()
    {
        return value;
    }

    typename string_type::const_iterator iterator;
    typename string_type::const_iterator end_iterator;

    char32_t value{}; // always save complete unicode code point at this point
    size_t sequence_length{}; // number of code units the current code point occupies
};

// Output iterator appending code points to a std::basic_string<T>, encoded
// according to the size of T. Assigning a char32_t through the dereferenced
// iterator appends its encoding to the string.
template<typename T>
struct utf_back_insert_iterator
{
    typedef std::basic_string<T> string_type;

    // member types required by std::iterator_traits
    typedef std::output_iterator_tag iterator_category;
    typedef void value_type;
    typedef std::ptrdiff_t difference_type;
    typedef void pointer;
    typedef utf_back_insert_iterator& reference;

    utf_back_insert_iterator(string_type& s): s(s) {}

    // no-op
    utf_back_insert_iterator& operator++() { return *this; }

    // support *x = value, together with operator=()
    reference operator*() { return *this; }

    // Append the encoding of value to the string.
    //
    // value is expected to be a valid Unicode value already; only the UTF-8
    // branch rejects values >= 0x110000 (std::runtime_error).
    // The encoding is chosen at compile time from sizeof(T1), replacing
    // in-class explicit specializations which GCC rejects.
    template<class T1 = T>
    reference operator=(const char32_t& value)
    {
        if constexpr (sizeof(T1) == 1) { // UTF-8: append byte sequence
            if (value < 0x80) { // 1 byte
                s.push_back(static_cast<T>(value));
            } else if (value < 0x800) { // 2 bytes
                s.push_back(byte_n_of_m<0,2>(value));
                s.push_back(byte_n_of_m<1,2>(value));
            } else if (value < 0x10000) { // 3 bytes
                s.push_back(byte_n_of_m<0,3>(value));
                s.push_back(byte_n_of_m<1,3>(value));
                s.push_back(byte_n_of_m<2,3>(value));
            } else if (value < 0x110000) { // 4 bytes
                s.push_back(byte_n_of_m<0,4>(value));
                s.push_back(byte_n_of_m<1,4>(value));
                s.push_back(byte_n_of_m<2,4>(value));
                s.push_back(byte_n_of_m<3,4>(value));
            } else
                throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<std::uint32_t>(value)));
        } else if constexpr (sizeof(T1) == 2) { // UTF-16: append word sequence
            if (value <= 0xFFFF) { // expect value to be already valid Unicode values
                s.push_back(static_cast<T>(value));
            } else { // surrogate pair
                const char32_t value_reduced = value - 0x10000;
                s.push_back(static_cast<T>((value_reduced >> 10) + 0xD800));
                s.push_back(static_cast<T>((value_reduced & 0x3FF) + 0xDC00));
            }
        } else { // UTF-32: one code unit per code point
            // expect value to be already valid Unicode values
            s.push_back(static_cast<T>(value));
        }
        return *this;
    }

    // lead byte of an n byte UTF-8 sequence
    // n is number of UTF-8 bytes in sequence
    template<size_t n>
    inline static T byte0_of(char32_t value)
    {
        // the cast keeps the low 8 bits: n marker bits plus the payload
        return static_cast<T>((value >> 6 * (n - 1)) | (0xFF << (8 - n)));
    }

    // continuation byte carrying one 6-bit group of value
    // n is index of 6-bit groups, counting from bit 0
    template<size_t n>
    inline static T trailing_byte(char32_t value)
    {
        return static_cast<T>(((value >> n * 6) & 0b111111) | 0b10000000);
    }

    // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
    // n is the byte position in the sequence, m the sequence length
    // assume value to be valid Unicode value for given byte position
    template<size_t n, size_t m>
    inline static T byte_n_of_m(char32_t value)
    {
        if constexpr (n == 0)
            return byte0_of<m>(value);
        else
            return trailing_byte<m - n - 1>(value);
    }

    typename utf_back_insert_iterator::string_type& s;
};

// output iterator appending to s
template<typename T>
utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s)
{
    return utf_back_insert_iterator<T>(s);
}

// iterator to the first code point of s; throws std::invalid_argument if the
// first sequence is malformed
template<typename T>
utf_iterator<T> utf_begin(const std::basic_string<T>& s)
{
    return utf_iterator<T>{s.cbegin(), s.cend()};
}

// past-the-end iterator of s
template<typename T>
utf_iterator<T> utf_end(const std::basic_string<T>& s)
{
    return utf_iterator<T>{s.cend(), s.cend()};
}

} // namespace unicode::detail

namespace unicode {

using namespace detail;

// Convert s from the encoding implied by From to the encoding implied by To.
// Throws std::invalid_argument if s is not well-formed.
template<typename From, typename To>
std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
{
    std::basic_string<To> result;

    std::copy(detail::utf_begin<From>(s), detail::utf_end<From>(s), detail::utf_back_inserter<To>(result));

    return result;
}

// Returns true if s is well-formed in the encoding implied by T.
template<typename T>
bool is_valid_utf(const std::basic_string<T>& s)
{
    try {
        // decoding every code point is the validation; the values are not needed
        std::for_each(detail::utf_begin<T>(s), detail::utf_end<T>(s), [](char32_t) {});
    } catch (const std::invalid_argument&) {
        // the only exception type the decoder throws for malformed input
        return false;
    }
    return true;
}

} // namespace unicode