diff options
| -rw-r--r-- | include/unicode.h | 51 | 
1 files changed, 32 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 4e4c7eb..2d7bf71 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -1,6 +1,6 @@
 // libunicode
 //
-// Author: Roland Reichwein
+// Author: Roland Reichwein <mail@reichwein.it>
 //
 // Available under the conditions of CC0 1.0 Universal
 // https://creativecommons.org/publicdomain/zero/1.0/
@@ -104,11 +104,11 @@ namespace unicode::detail {
  template<typename value_type, typename... Twords>
  inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
  {
-  constexpr auto n{sizeof...(Twords) + 1};
+  constexpr auto sequence_length{sizeof...(Twords) + 1};
 
-  static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+  static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
 
-  if constexpr(n == 1) {
+  if constexpr(sequence_length == 1) {
    return is_valid_unicode(word0);
   } else {
    char16_t unit0 {static_cast<char16_t>(word0)};
@@ -143,22 +143,35 @@ namespace unicode::detail {
   return true;
  }
 
+ template<size_t sequence_length, typename value_type>
+ inline char32_t decode_utf8_leading_byte(value_type b) noexcept
+ {
+  return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6);
+ }
+
  template<typename value_type>
- inline char32_t continuation_value(value_type b) noexcept
+ inline char32_t decode_utf8_followup_byte(value_type b) noexcept
  {
   return static_cast<char32_t>(b & 0b00111111);
  }
 
  template<typename value_type, typename... Targs>
- inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept
+ inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept
  {
-  return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
+  return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...);
  }
 
- template<size_t n, typename value_type>
- inline char32_t value_byte0_of(value_type b) noexcept
+ template<typename value_type, typename... Targs>
+ inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept
  {
-  return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
+  size_t constexpr sequence_length{sizeof...(Targs) + 1};
+
+  static_assert(sequence_length <= 4);
+
+  if constexpr (sequence_length == 1)
+   return b;
+  else
+   return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...);
  }
 
  template<typename T, typename Container=std::basic_string<T>>
@@ -206,32 +219,32 @@ namespace unicode::detail {
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
    if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII
     std::advance(iterator, 1);
-    return byte0;
+    return decode_utf8_sequence(byte0);
    } else {
     internal_type value{};
     if (size_t remaining{remaining_code_units()}; remaining >= 2) {
      utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
      if (is_utf8_sequence(byte0, byte1)) { // 2 bytes
-      value = value_byte0_of<2>(byte0) | continuation_value(byte1);
+      value = decode_utf8_sequence(byte0, byte1);
       std::advance(iterator, 2);
      } else if (remaining >= 3) {
       utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
       if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes
-       value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
+       value = decode_utf8_sequence(byte0, byte1, byte2);
        std::advance(iterator, 3);
       } else if (remaining >= 4) {
        utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
        if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes
-        value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
+        value = decode_utf8_sequence(byte0, byte1, byte2, byte3);
         std::advance(iterator, 4);
        } else
         throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
       } else
-       throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
+       throw std::invalid_argument("Bad UTF-8 input: 4th byte expected, none found (end of sequence)");
      } else
-      throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
+      throw std::invalid_argument("Bad UTF-8 input: 3rd byte expected, none found (end of sequence)");
     } else
-     throw std::invalid_argument("Bad input: 2nd byte expected, none found");
+     throw std::invalid_argument("Bad UTF-8 input: 2nd byte expected, none found (end of sequence)");
 
     // check only for sequences >= 2 bytes (ASCII is always compliant)
     if (!unicode::is_valid_unicode(value))
@@ -782,13 +795,13 @@ namespace unicode {
  typename To::string_type convert(const typename From::string_type& s)
  {
   // if input type == output type, only validate and return input, is appropriate
-  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) &&
                std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
                std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
    if (validate_utf<typename From::value_type>(s)) {
     return s;
    } else {
-    throw std::invalid_argument("Invalid UTF-8");
+    throw std::invalid_argument("Invalid UTF input");
    }
   } if constexpr(accu_size == 4 || accu_size == 8) {
    return convert_optimized<From, To>(s);
