// libunicode // Copyright (C) 2021 Roland Reichwein #pragma once #include #include #include #ifdef __has_cpp_attribute #if __has_cpp_attribute(__cpp_char8_t) // char8_t available #endif #endif namespace { struct utf8_iterator { typedef char32_t value_type; typedef char32_t& reference; utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): iterator(cbegin), end_iterator(cend) { calculate_value(); } utf8_iterator(const utf8_iterator& other) = default; utf8_iterator& operator=(const utf8_iterator& other) = default; // set value member void calculate_value() { if (iterator == end_iterator) return; char8_t first_byte {*iterator}; if (first_byte & 0x80) { // 2-4 bytes if (iterator + 1 != end_iterator) { char8_t second_byte {*(iterator + 1)}; if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111); sequence_length = 2; } else if (iterator + 2 != end_iterator) { char8_t third_byte {*(iterator + 2)}; if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111); sequence_length = 3; } else if (iterator + 3 != end_iterator) { char8_t fourth_byte {*(iterator + 3)}; if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); sequence_length = 4; } else throw std::invalid_argument("bad input: invalid 4 byte sequence"); } else throw std::invalid_argument("bad input: invalid 3 byte sequence"); } else throw std::invalid_argument("bad input: invalid 2 byte sequence"); } else throw std::invalid_argument("bad input: byte 2 expected, none found"); } else { // 1 byte: 7 bit ASCII value = first_byte; sequence_length = 1; } } // pre-increment utf8_iterator& operator++() { iterator += sequence_length; calculate_value(); return *this; } bool operator!=(const utf8_iterator& other) const { return iterator != other.iterator; } reference operator*() { return value; } std::u8string::const_iterator iterator; std::u8string::const_iterator end_iterator; value_type value{}; size_t sequence_length{}; }; struct utf16_back_insert_iterator { typedef utf16_back_insert_iterator& reference; utf16_back_insert_iterator(std::u16string& s): s(s) {} // no-op utf16_back_insert_iterator& operator++() { return *this; } // support *x = value, together with operator=() reference operator*() { return *this; } // append utf-16 word sequence reference operator=(const char32_t& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t! s.push_back(value); } else { s.push_back((value >> 10) + 0xD800); s.push_back((value & 0x3FF) + 0xDC00); } return *this; } std::u16string& s; }; utf16_back_insert_iterator utf16_back_inserter(std::u16string& s) { return utf16_back_insert_iterator(s); } utf8_iterator utf8_begin(const std::u8string& s) { return utf8_iterator{s.cbegin(), s.cend()}; } utf8_iterator utf8_end(const std::u8string& s) { return utf8_iterator{s.cend(), s.cend()}; } } // namespace namespace unicode { std::u16string utf8_to_utf16(const std::u8string& s) { std::u16string result; std::copy(utf8_begin(s), utf8_end(s), utf16_back_inserter(result)); return result; } //std::u8string utf16_to_utf8(const std::u16string& s) //{ // std::u8string result; // // std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result)); // // return result; //} } // namespace unicode