diff options
Diffstat (limited to 'include')
| -rw-r--r-- | include/unicode.h | 869 | ||||
| -rw-r--r-- | include/unicode/endian.h | 26 | ||||
| -rw-r--r-- | include/unicode/iso.h | 189 | ||||
| -rw-r--r-- | include/unicode/predicate.h | 21 | ||||
| -rw-r--r-- | include/unicode/type_traits.h | 77 | ||||
| -rw-r--r-- | include/unicode/types.h | 10 | ||||
| -rw-r--r-- | include/unicode/utf.h | 448 | 
7 files changed, 994 insertions, 646 deletions
| diff --git a/include/unicode.h b/include/unicode.h index 1190292..a50f525 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -7,625 +7,25 @@  #pragma once +#include "unicode/endian.h" +#include "unicode/iso.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" +  #include <algorithm> +#include <array>  #include <cstdint>  #include <iterator> -#include <list>  #include <memory>  #include <stdexcept>  #include <string>  #include <type_traits> -#include <unordered_map> - -#ifdef __cpp_char8_t -// char8_t available - typedef char8_t utf8_t; -#else - typedef char utf8_t; -#endif -typedef char iso_t; +#include <utility>  namespace unicode { - // bits_to_compare: limit bits to consider even further than defined by T - // T: usually, char32_t, uint32_t etc. - template<size_t bits_to_compare = 32, typename T> - static inline bool is_valid_unicode(const T& value) noexcept - { -  if constexpr(sizeof(T) == 1 || bits_to_compare <= 15) -   return true; -  else if constexpr(sizeof(T) == 2 || bits_to_compare <= 20) -   //return value <= 0xD7FF || value >= 0xE000; -   return (value & 0xF800) != 0xD800; -  else -   //return (value & 0xFFFFF800) != 0x0000D800 && (value >> 16) <= 0x10; -   return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF); - } - -} - -namespace unicode::detail { - - using namespace std::string_literals; - - template<size_t sequence_length, typename value_type> - inline bool is_utf8_leading_byte(value_type byte) noexcept - { -  static_assert(sequence_length <= 4); - -  if constexpr(sequence_length == 1) { -   return !(byte & 0x80); -  } else { -   return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length)); -  } - } - - template<typename value_type> - inline bool is_utf8_followup_byte(value_type b) noexcept - { -  return (b & 0b11000000) == 0b10000000; - } - - template<typename value_type, typename... Tbytes> - inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept - { -  constexpr auto sequence_length{sizeof...(Tbytes) + 1}; - -  static_assert(sequence_length <= 4, "UTF-8 sequences of 1 through 4 code units are supported"); - -  return is_utf8_leading_byte<sequence_length>(byte0) && -         (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right - } - - template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true> - inline bool validate_utf(const std::basic_string<T>& s) - { -  int i{}; -  auto size{s.size()}; -  while (i < size) { -   if (is_utf8_sequence(s[i])) { -    i++; -   } else if ((i < size - 1) && is_utf8_sequence(s[i], s[i + 1])) { -    i += 2; -   } else if ((i < size - 2) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) { -    if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20)) -     return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF -    i += 3; -   } else if ((i < size - 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) { -    if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11) -     return false; // Unicode too big above 0x10FFFF -    i += 4; -   } else { -    return false; -   } -  } -  return true; - } - - template<typename value_type, typename... Twords> - inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept - { -  constexpr auto sequence_length{sizeof...(Twords) + 1}; - -  static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); - -  if constexpr(sequence_length == 1) { -   return is_valid_unicode(word0); -  } else { -   char16_t unit0 {static_cast<char16_t>(word0)}; -   char16_t unit1 {static_cast<char16_t>((words, ...))}; -   return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00; -  } - } - - template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> - inline bool validate_utf(const std::basic_string<T>& s) - { -  int i{}; -  auto size{s.size()}; -  while (i < size) { -   if (is_utf16_sequence(s[i])) { -    i++; -   } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) { -    i += 2; -   } else { -    return false; -   } -  } -  return true; - } - - template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> - inline bool validate_utf(const std::basic_string<T>& s) - { -  for (auto i: s) -   if (!is_valid_unicode(i)) -    return false; -  return true; - } - - template<size_t sequence_length, typename value_type> - inline char32_t decode_utf8_leading_byte(value_type b) noexcept - { -  return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6); - } - - template<typename value_type> - inline char32_t decode_utf8_followup_byte(value_type b) noexcept - { -  return static_cast<char32_t>(b & 0b00111111); - } - - template<typename value_type, typename... Targs> - inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept - { -  return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...); - } - - template<typename value_type, typename... Targs> - inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept - { -  size_t constexpr sequence_length{sizeof...(Targs) + 1}; - -  static_assert(sequence_length <= 4); - -  if constexpr (sequence_length == 1) -   return b; -  else -   return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...); - } - - template<typename T, typename Container=std::basic_string<T>> - struct utf_iterator - { -  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); - -  typedef T value_type; -  typedef char32_t internal_type; -  typedef char32_t& reference; -  typedef char32_t* pointer; -  typedef size_t difference_type; -  typedef std::input_iterator_tag iterator_category; -  typedef Container string_type; - -  utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): -   iterator(cbegin), end_iterator(cend) -  { -  } - -  utf_iterator(const utf_iterator& other) = default; -  utf_iterator& operator=(const utf_iterator& other) = default; - -  inline size_t remaining_code_units() const noexcept -  { -   return std::distance(iterator, end_iterator); -  } - -  template<size_t index> -  inline value_type get_code_unit() const noexcept -  { -   if constexpr (std::is_same_v<Container, typename std::list<value_type>>) { -    // std::list doesn't support it + n -    auto it{iterator}; -    std::advance(it, index); -    return *it; -   } else { -    return *(iterator + index); -   } -  } - -  template<typename... Tbytes> -  inline internal_type calculate_utf8_value(Tbytes... bytes) -  { -   size_t constexpr sequence_length{sizeof...(Tbytes)}; -   static_assert(sequence_length >= 1 && sequence_length <= 4); - -   if constexpr(sequence_length > 1) { -    if (remaining_code_units() < sequence_length) -     throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence"); -   } - -   if (is_utf8_sequence(bytes...)) { -    std::advance(iterator, sequence_length); -    internal_type result{decode_utf8_sequence(bytes...)}; -    if (!unicode::is_valid_unicode<sequence_length * 6>(result)) -     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); -    return result; -   } else { -    if constexpr(sequence_length <= 3) // template recursion break condition: UTF-8 has 1..4 code units -     return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>())); -    else -     throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence"); -   } -  } - -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> -  inline internal_type calculate_value() -  { -   return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>())); -  } - -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> -  inline internal_type calculate_value() -  { -   char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; - -   if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane) -    std::advance(iterator, 1); -    return unit0; -   } else { -    if (remaining_code_units() < 2) -     throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - -    char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())}; -    if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) -     throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - -    std::advance(iterator, 2); -    return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; -   } -  } - -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> -  inline internal_type calculate_value() -  { -   internal_type result {static_cast<internal_type>(get_code_unit<0>())}; - -   if (!unicode::is_valid_unicode(result)) -    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); - -   std::advance(iterator, 1); - -   return result; -  } - -  // pre-increment -  utf_iterator& operator++() -  { -   return *this; -  } - -  bool operator!=(const utf_iterator& other) const -  { -   return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); -  } - -  internal_type operator*() -  { -   return calculate_value(); -  } - -  utf_iterator& operator+=(size_t distance) -  { -   std::advance(iterator, distance); -   return *this; -  } - -  size_t operator-(const utf_iterator& other) const -  { -   return iterator - other.iterator; -  } - - private: -  typename string_type::const_iterator iterator; -  typename string_type::const_iterator end_iterator; - }; - - template<typename T, typename Container=std::basic_string<T>> - struct utf_back_insert_iterator - { -  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); - -  typedef T value_type; -  typedef char32_t internal_type; -  typedef Container string_type; -  typedef utf_back_insert_iterator& reference; -  typedef utf_back_insert_iterator* pointer; -  typedef size_t difference_type; -  typedef std::output_iterator_tag iterator_category; - -  utf_back_insert_iterator(string_type& s): s(s) {} - -  utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other) -  { -   if (std::addressof(other.s) != std::addressof(s)) -    throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); - -   return *this; -  } - -  // no-op -  reference operator++() -  { -   return *this; -  } - -  // support *x = value, together with operator=() -  reference operator*() -  { -   return *this; -  } - -  // n is number of UTF-8 bytes in sequence -  template<size_t n> -  inline static value_type byte0_of(internal_type value) -  { -   return (value >> 6 * (n - 1)) | (0xFF << (8 - n)); -  } - -  // n is index of 6-bit groups, counting from bit 0 -  template<size_t n> -  inline static value_type trailing_byte(internal_type value) -  { -   return ((value >> n * 6) & 0b111111) | 0b10000000; -  } - -  // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII) -  // assume value to be valid Unicode value for given byte position -  template<size_t n, size_t m> -  inline static value_type byte_n_of_m(internal_type value) -  { -   if constexpr (n == 0) -    return byte0_of<m>(value); -   else -    return trailing_byte<m - n - 1>(value); -  } - -  template<typename... Args> -  inline void append(Args&&... args) -  { -   if constexpr (std::is_same_v<Container, typename std::basic_string<value_type>>) { -    s.append({args...}); -   } else { -    (s.emplace_back(args), ...); -   } -  } - -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> -  inline void append_utf(const internal_type& value) -  { -   if (value < 0x80) { // 1 byte -    append(static_cast<value_type>(value)); -   } else if (value < 0x800) { // 2 bytes -    append(byte_n_of_m<0,2>(value), byte_n_of_m<1,2>(value)); -   } else if (value < 0x10000) { // 3 bytes -    append(byte_n_of_m<0,3>(value), byte_n_of_m<1,3>(value), byte_n_of_m<2,3>(value)); -   } else { // 4 bytes -    // expect value to be already valid Unicode values (checked in input iterator) -    append(byte_n_of_m<0,4>(value), byte_n_of_m<1,4>(value), byte_n_of_m<2,4>(value), byte_n_of_m<3,4>(value)); -   } -  } - -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> -  inline void append_utf(const internal_type& value) -  { -   if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) -    append(static_cast<value_type>(value)); -   } else { -    internal_type value_reduced{value - 0x10000}; -    append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00)); -   } -  } - -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> -  inline void append_utf(const internal_type& value) -  { -   // expect value to be already valid Unicode values (checked in input iterator) -   append(static_cast<value_type>(value)); -  } - -  reference operator=(const internal_type& value) -  { -   append_utf(value); -   return *this; -  } - - private: -  typename utf_back_insert_iterator::string_type& s; - }; - - typedef std::unordered_map<iso_t, char32_t> iso_map_type; - typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse; - - // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary - static inline iso_map_type iso_8859_1_map; - - // ISO-8859-15 is lower 8-bit of Unicode, except for: - static inline iso_map_type iso_8859_15_map { -  { '\xA4', U'\u20AC' }, // € -  { '\xA6', U'\u0160' }, // Š -  { '\xA8', U'\u0161' }, // š -  { '\xB4', U'\u017D' }, // Ž -  { '\xB8', U'\u017E' }, // ž -  { '\xBC', U'\u0152' }, // Œ -  { '\xBD', U'\u0153' }, // œ -  { '\xBE', U'\u0178' }, // Ÿ - }; - - inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { -  iso_map_type_reverse result; -  std::for_each(map.cbegin(), map.cend(), -                [&](const iso_map_type::value_type& pair) -                 { -                  result.emplace(pair.second, pair.first); -                  result.emplace(static_cast<char32_t>(static_cast<uint8_t>(pair.first)), 0); // map invalid characters to a known non-mapped value as marker -                 }); -  return result; - } - - static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; - static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; - -} // namespace unicode::detail - -namespace unicode { - - using namespace detail; - - template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>> - struct iso_iterator { -  typedef iso_t value_type; -  typedef char32_t internal_type; -  typedef char32_t& reference; -  typedef char32_t* pointer; -  typedef size_t difference_type; -  typedef std::input_iterator_tag iterator_category; -  typedef typename Container::const_iterator iterator; -  typedef Container string_type; - -  iso_iterator(const iterator& it): m_it(it) {} - -  // pre-increment -  iso_iterator& operator++() -  { -   ++m_it; -   return *this; -  } - -  bool operator!=(const iso_iterator& other) const -  { -   return m_it != other.m_it; -  } - -  // return reference? -  internal_type operator*() const -  { -   value_type value{*m_it}; - -   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed -   { -    auto it{Map.find(value)}; -    if (it != Map.end()) -     return it->second; -   } -   return static_cast<internal_type>(static_cast<uint8_t>(value)); -  } - -  iso_iterator& operator+=(size_t distance) -  { -   std::advance(m_it, distance); -   return *this; -  } - -  difference_type operator-(const iso_iterator& other) const -  { -   return m_it - other.m_it; -  } - - private: -  iterator m_it; - }; - - template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>> - struct iso_back_insert_iterator { -  typedef iso_back_insert_iterator& reference; -  typedef iso_back_insert_iterator* pointer; -  typedef size_t difference_type; -  typedef iso_t value_type; -  typedef char32_t internal_type; -  typedef std::output_iterator_tag iterator_category; -  typedef Container string_type; -   -  iso_back_insert_iterator(string_type& s): s(s) {} - -  iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other) -  { -   if (std::addressof(other.s) != std::addressof(s)) -    throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); - -   return *this; -  } - -  // no-op -  reference operator++() -  { -   return *this; -  } - -  // support *x = value, together with operator=() -  reference operator*() -  { -   return *this; -  } - -  reference operator=(const internal_type& value) -  { -   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed -   { -    auto it{Map.find(value)}; -    if (it != Map.end()) { -     if (it->second == 0) // marker for non-mappable character found -      throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value))); -     s.push_back(it->second); -     return *this; -    } -   } - -   if (value > 255) -    throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value))); - -   s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value)); -   return *this; -  } - - private: -  typename iso_back_insert_iterator::string_type& s; - }; - - // Encoding for convert() and ISO-8859-* - template<typename InputIt, typename OutputIt> - struct ISO_8859 - { -  typedef iso_t value_type; -  typedef typename InputIt::string_type string_type; - -  static InputIt begin(const typename InputIt::string_type& s) -  { -   return InputIt(s.cbegin()); -  } - -  static InputIt end(const typename InputIt::string_type& s) -  { -   return InputIt(s.cend()); -  } - -  static OutputIt back_inserter(typename OutputIt::string_type& s) -  { -   return OutputIt(s); -  } - }; - - // Encoding for convert() and UTF-* - template<typename InputIt, typename OutputIt> - struct UTF - { -  typedef typename OutputIt::value_type value_type; -  typedef typename InputIt::string_type string_type; - -  static InputIt begin(const typename InputIt::string_type& s) -  { -   return InputIt{s.cbegin(), s.cend()}; -  } - -  static InputIt end(const typename InputIt::string_type& s) -  { -   return InputIt{s.cend(), s.cend()}; -  } - -  static OutputIt back_inserter(typename OutputIt::string_type& s) -  { -   return OutputIt(s); -  } - }; - - // Encoding for convert() - typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; - typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; -  - typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; - typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; - typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; -   // std::distance doesn't work here: it is based on "output" distance of iterators   template<class Iterator>   inline size_t input_distance(const Iterator& it1, const Iterator& it2) @@ -729,7 +129,7 @@ namespace unicode {   // Optimize for the case of all ASCII (7-bit) data in a accu size row   // From and To are Encodings - template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>   typename To::string_type convert_optimized(const typename From::string_type& s)   {    typename To::string_type result; @@ -774,20 +174,224 @@ namespace unicode {    return result;   } + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 1), bool> = true> + inline void append_utf(std::basic_string<To>& result, const char32_t& value) + { +  using From = char32_t; +  if (bits_to_compare <= 7 || value < 0x80) { // 1 byte +   result.push_back(static_cast<To>(value)); +  } else if (bits_to_compare <= 11 || value < 0x800) { // 2 bytes +   result.append({utf8_byte_n_of_m<0,2,From,To>(value), utf8_byte_n_of_m<1,2,From,To>(value)}); +  } else if (bits_to_compare <= 16 || value < 0x10000) { // 3 bytes +   result.append({utf8_byte_n_of_m<0,3,From,To>(value), utf8_byte_n_of_m<1,3,From,To>(value), utf8_byte_n_of_m<2,3,From,To>(value)}); +  } else { // 4 bytes +   // expect value to be already valid Unicode values +   result.append({utf8_byte_n_of_m<0,4,From,To>(value), utf8_byte_n_of_m<1,4,From,To>(value), utf8_byte_n_of_m<2,4,From,To>(value), utf8_byte_n_of_m<3,4,From,To>(value)}); +  } + } + + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 2), bool> = true> + inline void append_utf(std::basic_string<To>& result, const char32_t& value) + { +  if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values +   result.push_back(static_cast<To>(value)); +  } else { +   char32_t value_reduced{value - 0x10000}; +   result.append({static_cast<To>((value_reduced >> 10) + 0xD800), static_cast<To>((value_reduced & 0x3FF) + 0xDC00)}); +  } + } + + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 4), bool> = true> + inline void append_utf(std::basic_string<To>& result, const char32_t& value) + { +  // expect value to be already valid Unicode values (checked in input iterator) +  result.push_back(static_cast<To>(value)); + } + + // Little Endian optimized version for UTF-8 + // In block_mode, at least 4 bytes are in accu. On first call, even 8. + // otherwise, at least one code unit is in accu + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 1), bool> = true> + inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) + { +#if 1 +  if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) { +   result.append({ +                 static_cast<To>(accu & 0x7F), +                 static_cast<To>((accu >> 8) & 0x7F), +                 static_cast<To>((accu >> 16) & 0x7F), +                 static_cast<To>((accu >> 24) & 0x7F), +                 static_cast<To>((accu >> 32) & 0x7F), +                 static_cast<To>((accu >> 40) & 0x7F), +                 static_cast<To>((accu >> 48) & 0x7F), +                 static_cast<To>((accu >> 56) & 0x7F), +                 }); +   accu = 0; +   bytes_in_accu = 0; +  } else +#endif +  if ((accu & 0x80) == 0) { // 1 byte sequence +   append_utf<7>(result, static_cast<char32_t>(accu & 0x7F)); +   accu >>= 8; +   bytes_in_accu -= 1; +  } else if ((block_mode || bytes_in_accu >= 2) && (accu & 0xC0E0) == 0x80C0) { // 2 byte sequence +   char32_t value {static_cast<char32_t>(((accu & 0x1F) << 6) | ((accu >> 8) & 0x3f))}; +   accu >>= 16; +   bytes_in_accu -= 2; +   if (is_valid_unicode<11>(value)) +    append_utf<11>(result, value); +   else +    throw std::invalid_argument("Invalid Unicode character in 2 byte UTF-8 sequence"); +  } else if ((block_mode || bytes_in_accu >= 3) && (accu & 0xC0C0F0) == 0x8080E0) { // 3 byte sequence +   char32_t value {static_cast<char32_t>(((accu & 0x0F) << 12) | ((accu >> 2) & 0x0FC0) | ((accu >> 16) & 0x3f))}; +   accu >>= 24; +   bytes_in_accu -= 3; +   if (is_valid_unicode<16>(value)) +    append_utf<16>(result, value); +   else +    throw std::invalid_argument("Invalid Unicode character in 3 byte UTF-8 sequence"); +  } else if ((block_mode || bytes_in_accu >= 4) && (accu & 0xC0C0C0F8) == 0x808080F0) { // 4 byte sequence +   char32_t value {static_cast<char32_t>(((accu & 0x07) << 18) | ((accu << 4) & 0x3f000) | ((accu >> 10) & 0xFC0) | ((accu >> 24) & 0x3f))}; +   accu >>= 32; +   bytes_in_accu -= 4; +   if (is_valid_unicode<21>(value)) +    append_utf(result, value); +   else +    throw std::invalid_argument("Invalid Unicode character in 4 byte UTF-8 sequence"); +  } else +   throw std::invalid_argument("Invalid UTF-8 byte sequence"); + } + + // Little Endian optimized version for UTF-16 + // In block_mode, at least 4 bytes are in accu. On first call, even 8. + // otherwise, at least one code unit is in accu + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 2), bool> = true> + inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) + { +#if 1 +  if ((accu & 0xFF80FF80FF80FF80) == 0) { +   auto number_of_values{bytes_in_accu / sizeof(From)}; +   result.resize(result.size() + number_of_values); +   for (int i = 0; i < number_of_values; i++) { +    result[result.size() - number_of_values + i] = static_cast<To>(accu & 0x7F); +    accu >>= 16; +   } +   bytes_in_accu = 0; +  } else +#endif +  if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) { +   // found 4 code units forming 3 code points in UTF-16; +   // by definition of UTF-16, we have valid unicode values at this point +   if constexpr(sizeof(To) == 4) { +    //result.resize(result.size() + 2); +    //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000; +    result.append({ +                  static_cast<To>(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000), +                  static_cast<To>(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000) +                  }); +   } else { +    append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); +    append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000); +   } +   accu = 0; +   bytes_in_accu = 0; +  } else +  if (From unit0 {static_cast<From>(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) { +   append_utf<16>(result, unit0); +   accu >>= 16; +   bytes_in_accu -= 2; +  } else +  if ((accu & 0xFC00FC00) == 0xDC00D800) { +   // found 2 code units forming 1 code point in UTF-16; +   // by definition of UTF-16, we have a valid unicode value at this point +   append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); +   accu >>= 32; +   bytes_in_accu -= 4; +  } else +   throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); + } + + // Little Endian optimized version + template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> + typename To::string_type convert_optimized_utf(const typename From::string_type& s) + { +  typename To::string_type result; +  if constexpr(sizeof(typename From::value_type) == 4) { +   for (const auto value: s) { +    if (is_valid_unicode(value)) +     append_utf(result, value); +    else +     throw std::invalid_argument("Invalid Unicode character in UTF-32"); +   } +#if 0 +  } else if constexpr(sizeof(typename From::value_type) == 2) { +   for (int i = 0; i < s.size(); i++) { +    typename From::value_type unit0{s[i]}; +    if (is_valid_unicode(unit0)) { +     append_utf(result, unit0); +    } else { +     i++; +     if (i < s.size()) { +      typename From::value_type unit1 {s[i]}; +      if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) +       throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); + +      append_utf(result, (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000); +     } else +      throw std::invalid_argument("Invalid code unit at end of UTF-16 string"); +    } +   } +#endif +  } else { +   uint64_t accu{}; +   int bytes_in_accu{}; + +   size_t s_index{}; +   size_t s_size{s.size()}; +   while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { +    // read input +    // assume: bytes_in_accu < 8 +    accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); +    s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); +    bytes_in_accu = 8; + +    while (bytes_in_accu >= 4) { +     append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu); +    } +   } + +   // 0..3 bytes left in accu +   // 0..7 bytes left in s + +   while (s_index < s_size || bytes_in_accu > 0) { +    while (s_index < s_size && bytes_in_accu < 8) { +     accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); +     ++s_index; +     bytes_in_accu += sizeof(typename From::value_type); +    } + +    append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu); +   } +  } +  return result; + } +   // From and To are Encodings - template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>   typename To::string_type convert(const typename From::string_type& s)   {    // if input type == output type, only validate and return input, if appropriate    if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && -               std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> && -               std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) { +               is_utf_encoding_v<From> && is_utf_encoding_v<To>) {     if (validate_utf<typename From::value_type>(s)) {      return s;     } else {      throw std::invalid_argument("Invalid UTF input");     } -  } if constexpr(accu_size == 4 || accu_size == 8) { +  } else if constexpr(accu_size == 8 && is_little_endian() && sizeof(typename From::value_type) == 1 && +                      is_utf_encoding_v<From> && is_utf_encoding_v<To>) { // endian specific optimization +   return convert_optimized_utf<From, To>(s); +  } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input     return convert_optimized<From, To>(s);    } else {     typename To::string_type result; @@ -796,38 +400,11 @@ namespace unicode {    }   } - // Helper to get correct Encoding from char type, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type> - template<typename T> - struct Encoding - { - }; - - template<> - struct Encoding<utf8_t> - { -  typedef UTF_8 type; - }; - - template<> - struct Encoding<char16_t> - { -  typedef UTF_16 type; - }; - - template<> - struct Encoding<char32_t> - { -  typedef UTF_32 type; - }; - - template<typename T> - using Encoding_t = typename Encoding<T>::type; -   // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t   template<typename From, typename To,    typename FromContainer=std::basic_string<From>,    typename ToContainer=std::basic_string<To>, -  std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true> +  std::enable_if_t<is_char_v<From> && is_char_v<To>, bool> = true>   ToContainer convert(const FromContainer& s)   {    typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; @@ -841,7 +418,7 @@ namespace unicode {   // From and To are containers   template<typename FromContainer, typename ToContainer, -  std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true +  std::enable_if_t<is_container_v<FromContainer> && is_container_v<ToContainer>, bool> = true   >   ToContainer convert(const FromContainer& s)   { @@ -855,7 +432,7 @@ namespace unicode {   }   // Container version - template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true> + template<typename Container, std::enable_if_t<is_container_v<Container>, bool> = true>   bool is_valid_utf(const Container& s)   {    typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait; @@ -871,7 +448,7 @@ namespace unicode {   // basic type version   template<typename T,    typename Container=std::basic_string<T>, -  std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true> +  std::enable_if_t<is_char_v<T>, bool> = true>   bool is_valid_utf(const Container& s)   {    typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait; @@ -885,7 +462,7 @@ namespace unicode {   }   // Encoding version - template<typename Encoding, std::enable_if_t<std::is_empty<Encoding>::value, bool> = true> + template<typename Encoding, std::enable_if_t<is_encoding_v<Encoding>, bool> = true>   bool is_valid_utf(const typename Encoding::string_type& s)   {    return validate_utf<typename Encoding::value_type>(s); diff --git a/include/unicode/endian.h b/include/unicode/endian.h new file mode 100644 index 0000000..38bc1b7 --- /dev/null +++ b/include/unicode/endian.h @@ -0,0 +1,26 @@ +#pragma once + +#if __cplusplus >= 202002L +#include <bit> +#endif + +namespace unicode::detail { + +#if __cplusplus >= 202002L + consteval +#else + constexpr uint16_t endian_value{0x0102}; + constexpr uint8_t endian_value_1st_byte{(const uint8_t&)endian_value}; + + constexpr +#endif + bool is_little_endian() + { +#if __cplusplus >= 202002L +  return std::endian::native == std::endian::little; +#else +  return endian_value_1st_byte == 0x02; +#endif + } + +} // namespace unicode::detail diff --git a/include/unicode/iso.h b/include/unicode/iso.h new file mode 100644 index 0000000..9b20afd --- /dev/null +++ b/include/unicode/iso.h @@ -0,0 +1,189 @@ +#pragma once + +#include "types.h" + +#include <stdexcept> +#include <string> +#include <unordered_map> + +namespace unicode::detail { + + using namespace std::string_literals; + + typedef std::unordered_map<iso_t, char32_t> iso_map_type; + typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + static inline iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + static inline iso_map_type iso_8859_15_map { +  { '\xA4', U'\u20AC' }, // € +  { '\xA6', U'\u0160' }, // Š +  { '\xA8', U'\u0161' }, // š +  { '\xB4', U'\u017D' }, // Ž +  { '\xB8', U'\u017E' }, // ž +  { '\xBC', U'\u0152' }, // Œ +  { '\xBD', U'\u0153' }, // œ +  { '\xBE', U'\u0178' }, // Ÿ + }; + + inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { +  iso_map_type_reverse result; +  std::for_each(map.cbegin(), map.cend(), +                [&](const iso_map_type::value_type& pair) +                 { +                  result.emplace(pair.second, pair.first); +                  result.emplace(static_cast<char32_t>(static_cast<uint8_t>(pair.first)), 0); // map invalid characters to a known non-mapped value as marker +                 }); +  return result; + } + + static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + + template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>> + struct iso_iterator { +  typedef iso_t value_type; +  typedef char32_t internal_type; +  typedef char32_t& reference; +  typedef char32_t* pointer; +  typedef size_t difference_type; +  typedef std::input_iterator_tag iterator_category; +  typedef typename Container::const_iterator iterator; +  typedef Container string_type; + +  iso_iterator(const iterator& it): m_it(it) {} + +  // pre-increment +  iso_iterator& operator++() +  { +   ++m_it; +   return *this; +  } + +  bool operator!=(const iso_iterator& other) const +  { +   return m_it != other.m_it; +  } + +  // return reference? +  internal_type operator*() const +  { +   value_type value{*m_it}; + +   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed +   { +    auto it{Map.find(value)}; +    if (it != Map.end()) +     return it->second; +   } +   return static_cast<internal_type>(static_cast<uint8_t>(value)); +  } + +  iso_iterator& operator+=(size_t distance) +  { +   std::advance(m_it, distance); +   return *this; +  } + +  difference_type operator-(const iso_iterator& other) const +  { +   return m_it - other.m_it; +  } + + private: +  iterator m_it; + }; + + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>> + struct iso_back_insert_iterator { +  typedef iso_back_insert_iterator& reference; +  typedef iso_back_insert_iterator* pointer; +  typedef size_t difference_type; +  typedef iso_t value_type; +  typedef char32_t internal_type; +  typedef std::output_iterator_tag iterator_category; +  typedef Container string_type; +   +  iso_back_insert_iterator(string_type& s): s(s) {} + +  iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other) +  { +   if (std::addressof(other.s) != std::addressof(s)) +    throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); + +   return *this; +  } + +  // no-op +  reference operator++() +  { +   return *this; +  } + +  // support *x = value, together with operator=() +  reference operator*() +  { +   return *this; +  } + +  reference operator=(const internal_type& value) +  { +   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed +   { +    auto it{Map.find(value)}; +    if (it != Map.end()) { +     if (it->second == 0) // marker for non-mappable character found +      throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value))); +     s.push_back(it->second); +     return *this; +    } +   } + +   if (value > 255) +    throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value))); + +   s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value)); +   return *this; +  } + + private: +  typename iso_back_insert_iterator::string_type& s; + }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + // Encoding for convert() and ISO-8859-* + template<typename InputIt, typename OutputIt> + struct ISO_8859 + { +  typedef iso_t value_type; +  typedef typename InputIt::string_type string_type; + +  static InputIt begin(const typename InputIt::string_type& s) +  { +   return InputIt(s.cbegin()); +  } + +  static InputIt end(const typename InputIt::string_type& s) +  { +   return InputIt(s.cend()); +  } + +  static OutputIt back_inserter(typename OutputIt::string_type& s) +  { +   return OutputIt(s); +  } + }; + + // Encoding for convert() + typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; + +} // namespace unicode + diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h new file mode 100644 index 0000000..5f8c6a4 --- /dev/null +++ b/include/unicode/predicate.h @@ -0,0 +1,21 @@ +#pragma once + +namespace unicode { + + // bits_to_compare: limit bits to consider even further than defined by T + // T: usually, char32_t, uint32_t etc. + template<size_t bits_to_compare = 32, typename T> + static inline bool is_valid_unicode(const T& value) noexcept + { +  if constexpr(sizeof(T) == 1 || bits_to_compare <= 15) +   return true; +  else if constexpr(sizeof(T) == 2 || bits_to_compare <= 20) +   //return value <= 0xD7FF || value >= 0xE000; +   return (value & 0xF800) != 0xD800; +  else +   //return (value & 0xFFFFF800) != 0x0000D800 && (value >> 16) <= 0x10; +   return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF); + } + +} // namespace unicode + diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h new file mode 100644 index 0000000..3ee1d82 --- /dev/null +++ b/include/unicode/type_traits.h @@ -0,0 +1,77 @@ +#pragma once + +#include "utf.h" + +#include <string> +#include <type_traits> + +namespace unicode { + + using namespace detail; + + // helper traits +  + template<typename T> + struct is_encoding + { +  static const bool value{std::is_empty_v<T>}; + }; +  + template<typename T> + inline constexpr bool is_encoding_v {is_encoding<T>::value}; + + template<typename T> + struct is_container + { +  static const bool value{!std::is_empty_v<T>}; + }; +  + template<typename T> + inline constexpr bool is_container_v {is_container<T>::value}; + + template<typename T> + struct is_char + { +  static const bool value{std::is_trivial_v<T> && std::is_scalar_v<T> && !std::is_empty_v<T>}; + }; +  + template<typename T> + inline constexpr bool is_char_v {is_char<T>::value}; + + template<typename T> + struct is_utf_encoding + { +  static const bool value{std::is_same_v<T, UTF<utf_iterator<typename T::value_type>, utf_back_insert_iterator<typename T::value_type>>>}; + }; + + template<typename T> + inline constexpr bool is_utf_encoding_v {is_utf_encoding<T>::value}; + + template<typename T> + struct is_utf_8 + { +  static const bool value{std::is_trivial_v<T> && sizeof(T) == 1}; + }; +  + template<typename T> + inline constexpr bool is_utf_8_v {is_utf_8<T>::value}; + + template<typename T> + struct is_utf_16 + { +  static const bool value{std::is_trivial_v<T> && sizeof(T) == 2}; + }; +  + template<typename T> + inline constexpr bool is_utf_16_v {is_utf_16<T>::value}; + + template<typename T> + struct is_utf_32 + { +  static const bool value{std::is_trivial_v<T> && sizeof(T) == 4}; + }; +  + template<typename T> + inline constexpr bool is_utf_32_v {is_utf_32<T>::value}; + +} // namespace unicode diff --git a/include/unicode/types.h b/include/unicode/types.h new file mode 100644 index 0000000..a4461d7 --- /dev/null +++ b/include/unicode/types.h @@ -0,0 +1,10 @@ +#pragma once + +#ifdef __cpp_char8_t +// char8_t available + typedef char8_t utf8_t; +#else + typedef char utf8_t; +#endif +typedef char iso_t; + diff --git a/include/unicode/utf.h b/include/unicode/utf.h new file mode 100644 index 0000000..dd504a7 --- /dev/null +++ b/include/unicode/utf.h @@ -0,0 +1,448 @@ +#pragma once + +#include <list> +#include <string> +#include <stdexcept> + +namespace unicode::detail { + + using namespace std::string_literals; + + template<size_t sequence_length, typename value_type> + inline bool is_utf8_leading_byte(value_type byte) noexcept + { +  static_assert(sequence_length <= 4); + +  if constexpr(sequence_length == 1) { +   return !(byte & 0x80); +  } else { +   return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length)); +  } + } + + template<typename value_type> + inline bool is_utf8_followup_byte(value_type b) noexcept + { +  return (b & 0b11000000) == 0b10000000; + } + + template<typename value_type, typename... Tbytes> + inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept + { +  constexpr auto sequence_length{sizeof...(Tbytes) + 1}; + +  static_assert(sequence_length <= 4, "UTF-8 sequences of 1 through 4 code units are supported"); + +  return is_utf8_leading_byte<sequence_length>(byte0) && +         (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { +  int i{}; +  auto size{s.size()}; +  while (i < size) { +   if (is_utf8_sequence(s[i])) { +    i++; +   } else if ((i < size - 1) && is_utf8_sequence(s[i], s[i + 1])) { +    i += 2; +   } else if ((i < size - 2) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) { +    if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20)) +     return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF +    i += 3; +   } else if ((i < size - 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) { +    if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11) +     return false; // Unicode too big above 0x10FFFF +    i += 4; +   } else { +    return false; +   } +  } +  return true; + } + + template<typename value_type, typename... Twords> + inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept + { +  constexpr auto sequence_length{sizeof...(Twords) + 1}; + +  static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported"); + +  if constexpr(sequence_length == 1) { +   return is_valid_unicode(word0); +  } else { +   char16_t unit0 {static_cast<char16_t>(word0)}; +   char16_t unit1 {static_cast<char16_t>((words, ...))}; +   return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00; +  } + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { +  int i{}; +  auto size{s.size()}; +  while (i < size) { +   if (is_utf16_sequence(s[i])) { +    i++; +   } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) { +    i += 2; +   } else { +    return false; +   } +  } +  return true; + } + + template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> + inline bool validate_utf(const std::basic_string<T>& s) + { +  for (auto i: s) +   if (!is_valid_unicode(i)) +    return false; +  return true; + } + + template<size_t sequence_length, typename value_type> + inline char32_t decode_utf8_leading_byte(value_type b) noexcept + { +  return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6); + } + + template<typename value_type> + inline char32_t decode_utf8_followup_byte(value_type b) noexcept + { +  return static_cast<char32_t>(b & 0b00111111); + } + + template<typename value_type, typename... Targs> + inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept + { +  return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...); + } + + template<typename value_type, typename... Targs> + inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept + { +  size_t constexpr sequence_length{sizeof...(Targs) + 1}; + +  static_assert(sequence_length <= 4); + +  if constexpr (sequence_length == 1) +   return b; +  else +   return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...); + } + + template<typename T, typename Container=std::basic_string<T>> + struct utf_iterator + { +  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + +  typedef T value_type; +  typedef char32_t internal_type; +  typedef char32_t& reference; +  typedef char32_t* pointer; +  typedef size_t difference_type; +  typedef std::input_iterator_tag iterator_category; +  typedef Container string_type; + +  utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): +   iterator(cbegin), end_iterator(cend) +  { +  } + +  utf_iterator(const utf_iterator& other) = default; +  utf_iterator& operator=(const utf_iterator& other) = default; + +  inline size_t remaining_code_units() const noexcept +  { +   return std::distance(iterator, end_iterator); +  } + +  template<size_t index> +  inline value_type get_code_unit() const noexcept +  { +   if constexpr (std::is_same_v<Container, typename std::list<value_type>>) { +    // std::list doesn't support it + n +    auto it{iterator}; +    std::advance(it, index); +    return *it; +   } else { +    return *(iterator + index); +   } +  } + +  template<typename... Tbytes> +  inline internal_type calculate_utf8_value(Tbytes... bytes) +  { +   size_t constexpr sequence_length{sizeof...(Tbytes)}; +   static_assert(sequence_length >= 1 && sequence_length <= 4); + +   if constexpr(sequence_length > 1) { +    if (remaining_code_units() < sequence_length) +     throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence"); +   } + +   if (is_utf8_sequence(bytes...)) { +    std::advance(iterator, sequence_length); +    internal_type result{decode_utf8_sequence(bytes...)}; +    if (!unicode::is_valid_unicode<sequence_length * 6>(result)) +     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); +    return result; +   } else { +    if constexpr(sequence_length <= 3) // template recursion break condition: UTF-8 has 1..4 code units +     return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>())); +    else +     throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence"); +   } +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> +  inline internal_type calculate_value() +  { +   return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>())); +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> +  inline internal_type calculate_value() +  { +   char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; + +   if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane) +    std::advance(iterator, 1); +    return unit0; +   } else { +    if (remaining_code_units() < 2) +     throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); + +    char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())}; +    if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) +     throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); + +    std::advance(iterator, 2); +    return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; +   } +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> +  inline internal_type calculate_value() +  { +   internal_type result {static_cast<internal_type>(get_code_unit<0>())}; + +   if (!unicode::is_valid_unicode(result)) +    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); + +   std::advance(iterator, 1); + +   return result; +  } + +  // pre-increment +  utf_iterator& operator++() +  { +   return *this; +  } + +  bool operator!=(const utf_iterator& other) const +  { +   return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); +  } + +  internal_type operator*() +  { +   return calculate_value(); +  } + +  utf_iterator& operator+=(size_t distance) +  { +   std::advance(iterator, distance); +   return *this; +  } + +  size_t operator-(const utf_iterator& other) const +  { +   return iterator - other.iterator; +  } + + private: +  typename string_type::const_iterator iterator; +  typename string_type::const_iterator end_iterator; + }; + + // n is number of UTF-8 bytes in sequence + template<size_t n, typename From, typename To> + inline To utf8_byte0_of(const From& value) + { +  return (value >> 6 * (n - 1)) | (0xFF << (8 - n)); + } + + // n is index of 6-bit groups, counting from bit 0 + template<size_t n, typename From, typename To> + inline To utf8_trailing_byte(const From& value) + { +  return ((value >> n * 6) & 0b111111) | 0b10000000; + } + + // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII) + // assume value to be valid Unicode value for given byte position + template<size_t n, size_t m, typename From, typename To> + inline To utf8_byte_n_of_m(const From& value) + { +  if constexpr (n == 0) +   return utf8_byte0_of<m, From, To>(value); +  else +   return utf8_trailing_byte<m - n - 1, From, To>(value); + } + + template<typename T, typename Container=std::basic_string<T>> + struct utf_back_insert_iterator + { +  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + +  typedef T value_type; +  typedef char32_t internal_type; +  typedef Container string_type; +  typedef utf_back_insert_iterator& reference; +  typedef utf_back_insert_iterator* pointer; +  typedef size_t difference_type; +  typedef std::output_iterator_tag iterator_category; + +  utf_back_insert_iterator(string_type& s): s(s) {} + +  utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other) +  { +   if (std::addressof(other.s) != std::addressof(s)) +    throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to."); + +   return *this; +  } + +  // no-op +  reference operator++() +  { +   return *this; +  } + +  // support *x = value, together with operator=() +  reference operator*() +  { +   return *this; +  } + +  template<typename... Args> +  inline void append(Args&&... args) +  { +   if constexpr (std::is_same_v<Container, typename std::basic_string<value_type>>) { +    s.append({args...}); +   } else { +    (s.emplace_back(args), ...); +   } +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> +  inline void append_utf(const internal_type& value) +  { +   using Y = internal_type; +   if (value < 0x80) { // 1 byte +    append(static_cast<value_type>(value)); +   } else if (value < 0x800) { // 2 bytes +    append(utf8_byte_n_of_m<0,2,Y,X>(value), utf8_byte_n_of_m<1,2,Y,X>(value)); +   } else if (value < 0x10000) { // 3 bytes +    append(utf8_byte_n_of_m<0,3,Y,X>(value), utf8_byte_n_of_m<1,3,Y,X>(value), utf8_byte_n_of_m<2,3,Y,X>(value)); +   } else { // 4 bytes +    // expect value to be already valid Unicode values (checked in input iterator) +    append(utf8_byte_n_of_m<0,4,Y,X>(value), utf8_byte_n_of_m<1,4,Y,X>(value), utf8_byte_n_of_m<2,4,Y,X>(value), utf8_byte_n_of_m<3,4,Y,X>(value)); +   } +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> +  inline void append_utf(const internal_type& value) +  { +   if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) +    append(static_cast<value_type>(value)); +   } else { +    internal_type value_reduced{value - 0x10000}; +    append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00)); +   } +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> +  inline void append_utf(const internal_type& value) +  { +   // expect value to be already valid Unicode values (checked in input iterator) +   append(static_cast<value_type>(value)); +  } + +  reference operator=(const internal_type& value) +  { +   append_utf(value); +   return *this; +  } + + private: +  typename utf_back_insert_iterator::string_type& s; + }; + +} // namespace unicode::detail + +namespace unicode { +  + // Encoding for convert() and UTF-* + template<typename InputIt, typename OutputIt> + struct UTF + { +  typedef typename OutputIt::value_type value_type; +  typedef typename InputIt::string_type string_type; + +  static InputIt begin(const typename InputIt::string_type& s) +  { +   return InputIt{s.cbegin(), s.cend()}; +  } + +  static InputIt end(const typename InputIt::string_type& s) +  { +   return InputIt{s.cend(), s.cend()}; +  } + +  static OutputIt back_inserter(typename OutputIt::string_type& s) +  { +   return OutputIt(s); +  } + }; + + // Encoding for convert() + typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; + typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; + typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; + + // Helper to get correct Encoding from char type, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type> + template<typename T> + struct Encoding + { + }; + + template<> + struct Encoding<utf8_t> + { +  typedef UTF_8 type; + }; + + template<> + struct Encoding<char16_t> + { +  typedef UTF_16 type; + }; + + template<> + struct Encoding<char32_t> + { +  typedef UTF_32 type; + }; + + template<typename T> + using Encoding_t = typename Encoding<T>::type; + +} // namespace unicode + | 
