diff options
Diffstat (limited to 'include')
| -rw-r--r-- | include/unicode.h | 257 | 
1 files changed, 211 insertions, 46 deletions
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>

namespace {

 using namespace std::string_literals;

 // Input iterator that walks a string of UTF code units and yields one
 // Unicode code point (char32_t) per step.
 //
 // The encoding is selected by the width of the code unit type T:
 //   1 byte  -> UTF-8  (char8_t, and also plain char)
 //   2 bytes -> UTF-16 (char16_t)
 //   other   -> UTF-32 (char32_t)
 //
 // Malformed input is reported with std::invalid_argument, either from the
 // constructor or from operator++ (both decode the sequence they land on).
 template<typename T>
 struct utf_iterator
 {
  // Member types required by std::iterator_traits, so that standard
  // algorithms such as std::copy accept this iterator.
  typedef std::input_iterator_tag iterator_category;
  typedef char32_t value_type;
  typedef std::ptrdiff_t difference_type;
  typedef char32_t* pointer;
  typedef char32_t& reference;
  typedef std::basic_string<T> string_type;

  // cbegin: position of the first code unit to decode
  // cend:   end of the underlying string, used for bounds checks
  utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
   iterator(cbegin), end_iterator(cend)
  {
   calculate_value<T>();
  }

  // Injected class name instead of utf_iterator<T>: a template-id is not a
  // valid constructor name in C++20.
  utf_iterator(const utf_iterator& other) = default;
  utf_iterator& operator=(const utf_iterator& other) = default;

  // Number of code units between the current position and the end.
  std::size_t remaining_code_units() const
  {
   return static_cast<std::size_t>(end_iterator - iterator);
  }

  // Code unit at offset `index` from the current position.
  // The caller has to check remaining_code_units() first.
  template<std::size_t index>
  T get_code_unit() const
  {
   return *(iterator + static_cast<difference_type>(index));
  }

  // Decode the sequence at the current position into `value` and
  // `sequence_length`. T1 selects the encoding by code unit width.
  // At the end of input both members are reset to zero.
  template<typename T1>
  void calculate_value()
  {
   const std::size_t remaining{remaining_code_units()};

   if (!remaining) {
    value = 0;
    sequence_length = 0;
    return;
   }

   if constexpr (sizeof(T1) == 1)
    decode_utf8(remaining);
   else if constexpr (sizeof(T1) == 2)
    decode_utf16(remaining);
   else
    decode_utf32();
  }

  // true for Unicode scalar values: at most U+10FFFF and not a surrogate
  static bool is_scalar_value(char32_t code_point)
  {
   return code_point <= 0x10FFFF && (code_point < 0xD800 || code_point > 0xDFFF);
  }

  // true if b has the UTF-8 continuation byte pattern 10xxxxxx
  inline static bool is_continuation_byte(T b)
  {
   return (b & 0b11000000) == 0b10000000;
  }

  // true if all given bytes are UTF-8 continuation bytes
  template<typename... Targs>
  inline static bool is_continuation_byte(T b, Targs... Fargs)
  {
   return is_continuation_byte(b) && is_continuation_byte(Fargs...);
  }

  // true if b is the lead byte of an n byte UTF-8 sequence (n = 2..4),
  // i.e. n one-bits followed by a zero-bit
  template<std::size_t n>
  inline static bool is_byte0_of(T b)
  {
   const unsigned int lead{static_cast<unsigned char>(b)};
   const unsigned int mask{(0xFFu << (7 - n)) & 0xFFu};
   const unsigned int pattern{(0xFFu << (8 - n)) & 0xFFu};
   return (lead & mask) == pattern;
  }

  // payload (low 6 bits) of a single continuation byte
  inline static char32_t continuation_value(T b)
  {
   return static_cast<char32_t>(b & 0b00111111);
  }

  // combined payload of several continuation bytes, most significant first.
  // Each byte is shifted by 6 bits for every byte that follows it.
  template<typename... Targs>
  inline static char32_t continuation_value(T b, Targs... Fargs)
  {
   return (continuation_value(b) << (6 * sizeof...(Fargs))) | continuation_value(Fargs...);
  }

  // payload of the lead byte of an n byte sequence, shifted into position
  template<std::size_t n>
  inline static char32_t value_byte0_of(T b)
  {
   return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
  }

  // pre-increment: skip the current sequence and decode the next one
  utf_iterator& operator++()
  {
   iterator += static_cast<difference_type>(sequence_length);
   calculate_value<T>();
   return *this;
  }

  // post-increment
  utf_iterator operator++(int)
  {
   utf_iterator previous{*this};
   ++*this;
   return previous;
  }

  // Iterators compare by position only.
  bool operator!=(const utf_iterator& other) const
  {
   return iterator != other.iterator;
  }

  bool operator==(const utf_iterator& other) const
  {
   return iterator == other.iterator;
  }

  // code point decoded at the current position
  reference operator*()
  {
   return value;
  }

  typename string_type::const_iterator iterator;
  typename string_type::const_iterator end_iterator;

  value_type value{};
  std::size_t sequence_length{};

 private:
  // Store the result of an n byte UTF-8 sequence after rejecting overlong
  // encodings, encoded surrogates and values above U+10FFFF.
  template<std::size_t n>
  void store_utf8(char32_t code_point)
  {
   constexpr char32_t minimum{n == 2 ? 0x80 : (n == 3 ? 0x800 : 0x10000)};

   if (code_point < minimum || !is_scalar_value(code_point))
    throw std::invalid_argument("Bad input: Overlong or out-of-range UTF-8 sequence");

   value = code_point;
   sequence_length = n;
  }

  // UTF-8: 1 to 4 bytes per code point
  void decode_utf8(std::size_t remaining)
  {
   const T byte0{get_code_unit<0>()};

   if (!(byte0 & 0x80)) { // 1 byte: 7 bit ASCII
    value = static_cast<char32_t>(byte0);
    sequence_length = 1;
    return;
   }

   if (remaining < 2)
    throw std::invalid_argument("Bad input: 2nd byte expected, none found");

   const T byte1{get_code_unit<1>()};
   if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
    store_utf8<2>(value_byte0_of<2>(byte0) | continuation_value(byte1));
    return;
   }

   if (remaining < 3)
    throw std::invalid_argument("Bad input: Invalid 2 byte sequence");

   const T byte2{get_code_unit<2>()};
   if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
    store_utf8<3>(value_byte0_of<3>(byte0) | continuation_value(byte1, byte2));
    return;
   }

   if (remaining < 4)
    throw std::invalid_argument("Bad input: Invalid 3 byte sequence");

   const T byte3{get_code_unit<3>()};
   if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
    store_utf8<4>(value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3));
    return;
   }

   throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
  }

  // UTF-16: 1 unit for the BMP (Basic Multilingual Plane), otherwise a
  // high/low surrogate pair
  void decode_utf16(std::size_t remaining)
  {
   const char32_t unit0{static_cast<char16_t>(get_code_unit<0>())};

   if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit
    value = unit0;
    sequence_length = 1;
    return;
   }

   if (remaining < 2)
    throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");

   const char32_t unit1{static_cast<char16_t>(get_code_unit<1>())};
   if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
    throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");

   // The 20 payload bits encode (code point - 0x10000).
   value = (((unit0 & 0x03FF) << 10) | (unit1 & 0x03FF)) + 0x10000;
   sequence_length = 2;
  }

  // UTF-32: every code unit is a code point
  void decode_utf32()
  {
   const char32_t code_point{static_cast<char32_t>(get_code_unit<0>())};

   if (!is_scalar_value(code_point))
    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<std::uint32_t>(code_point)));

   value = code_point;
   sequence_length = 1;
  }
 };

 // Output iterator that appends each assigned code point (char32_t) to a
 // string, encoded according to the width of the code unit type T:
 // 1 byte -> UTF-8, 2 bytes -> UTF-16, other -> UTF-32.
 //
 // The iterator keeps a reference to the target string, which therefore has
 // to outlive it.
 template<typename T>
 struct utf_back_insert_iterator
 {
  // Member types required by std::iterator_traits.
  typedef std::output_iterator_tag iterator_category;
  typedef void value_type;
  typedef std::ptrdiff_t difference_type;
  typedef void pointer;
  typedef std::basic_string<T> string_type;
  typedef utf_back_insert_iterator& reference;

  utf_back_insert_iterator(string_type& s): s(s) {}

  // no-op
  utf_back_insert_iterator& operator++()
  {
   return *this;
  }

  // no-op
  utf_back_insert_iterator& operator*()
  {
   return *this;
  }

  // Append the encoded form of `value`.
  // `value` is expected to be a valid Unicode value; only the upper bound
  // U+10FFFF is checked (UTF-8 and UTF-16), reported as std::runtime_error.
  template<typename T1=T>
  reference operator=(const char32_t& value)
  {
   if constexpr (sizeof(T1) == 1)
    append_utf8(value);
   else if constexpr (sizeof(T1) == 2)
    append_utf16(value);
   else
    s.push_back(static_cast<T>(value));
   return *this;
  }

  // lead byte of an n byte UTF-8 sequence
  // n is number of UTF-8 bytes in sequence
  template<std::size_t n>
  inline static T byte0_of(char32_t value)
  {
   return static_cast<T>(((value >> (6 * (n - 1))) | (0xFFu << (8 - n))) & 0xFFu);
  }

  // continuation byte carrying one 6-bit group
  // n is index of 6-bit groups, counting from bit 0
  template<std::size_t n>
  inline static T trailing_byte(char32_t value)
  {
   return static_cast<T>(((value >> (n * 6)) & 0b111111) | 0b10000000);
  }

  // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
  // assume value to be valid Unicode value for given byte position
  template<std::size_t n, std::size_t m>
  inline static T byte_n_of_m(char32_t value)
  {
   if constexpr (n == 0)
    return byte0_of<m>(value);
   else
    return trailing_byte<m - n - 1>(value);
  }

  string_type& s;

 private:
  [[noreturn]] static void throw_invalid_value(char32_t value)
  {
   throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<std::uint32_t>(value)));
  }

  // append utf-8 byte sequence
  void append_utf8(char32_t value)
  {
   if (value < 0x80) { // 1 byte
    s.push_back(static_cast<T>(value));
   } else if (value < 0x800) { // 2 bytes
    s.push_back(byte_n_of_m<0,2>(value));
    s.push_back(byte_n_of_m<1,2>(value));
   } else if (value < 0x10000) { // 3 bytes
    s.push_back(byte_n_of_m<0,3>(value));
    s.push_back(byte_n_of_m<1,3>(value));
    s.push_back(byte_n_of_m<2,3>(value));
   } else if (value < 0x110000) { // 4 bytes
    s.push_back(byte_n_of_m<0,4>(value));
    s.push_back(byte_n_of_m<1,4>(value));
    s.push_back(byte_n_of_m<2,4>(value));
    s.push_back(byte_n_of_m<3,4>(value));
   } else
    throw_invalid_value(value);
  }

  // append utf-16 word sequence
  void append_utf16(char32_t value)
  {
   if (value <= 0xFFFF) { // 1 unit
    s.push_back(static_cast<T>(value));
   } else if (value <= 0x10FFFF) { // surrogate pair
    // The pair carries the 20 bits of (code point - 0x10000).
    const char32_t offset{value - 0x10000};
    s.push_back(static_cast<T>(0xD800 + (offset >> 10)));
    s.push_back(static_cast<T>(0xDC00 + (offset & 0x3FF)));
   } else
    throw_invalid_value(value);
  }
 };

 // Create an encoding back inserter for the given string.
 template<typename T>
 utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s)
 {
  return utf_back_insert_iterator<T>(s);
 }

 // Decoding iterator positioned at the first code point of s.
 template<typename T>
 utf_iterator<T> utf_begin(const std::basic_string<T>& s)
 {
  return utf_iterator<T>{s.cbegin(), s.cend()};
 }

 // Decoding iterator positioned past the last code point of s.
 template<typename T>
 utf_iterator<T> utf_end(const std::basic_string<T>& s)
 {
  return utf_iterator<T>{s.cend(), s.cend()};
 }

} // namespace

namespace unicode {

// Convert a string from the UTF encoding of code unit type From to the UTF
// encoding of code unit type To (e.g. char8_t -> char16_t).
//
// Throws std::invalid_argument if s is not well-formed in the source encoding.
template<typename From, typename To>
std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
{
 std::basic_string<To> result;

 std::copy(utf_begin<From>(s), utf_end<From>(s), utf_back_inserter<To>(result));

 return result;
}

} // namespace unicode
