diff options
| -rw-r--r-- | include/unicode.h | 90 | 
1 files changed, 35 insertions, 55 deletions
| diff --git a/include/unicode.h b/include/unicode.h index 2d7bf71..8ac9f55 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -27,13 +27,14 @@ typedef char iso_t;  namespace unicode { - // usually, char32_t, uint32_t etc. - template<typename T> + // bits_to_compare: limit bits to consider even further than defined by T + // T: usually, char32_t, uint32_t etc. + template<size_t bits_to_compare = 32, typename T>   static inline bool is_valid_unicode(const T& value) noexcept   { -  if constexpr(sizeof(T) == 1) +  if constexpr(sizeof(T) == 1 || bits_to_compare <= 15)     return true; -  else if constexpr(sizeof(T) == 2) +  else if constexpr(sizeof(T) == 2 || bits_to_compare <= 20)     //return value <= 0xD7FF || value >= 0xE000;     return (value & 0xF800) != 0xD800;    else @@ -213,45 +214,35 @@ namespace unicode::detail {     }    } -  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> -  inline internal_type calculate_value() +  template<typename... Tbytes> +  inline internal_type calculate_utf8_value(Tbytes... bytes)    { -   utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; -   if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII -    std::advance(iterator, 1); -    return decode_utf8_sequence(byte0); -   } else { -    internal_type value{}; -    if (size_t remaining{remaining_code_units()}; remaining >= 2) { -     utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; -     if (is_utf8_sequence(byte0, byte1)) { // 2 bytes -      value = decode_utf8_sequence(byte0, byte1); -      std::advance(iterator, 2); -     } else if (remaining >= 3) { -      utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())}; -      if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes -       value = decode_utf8_sequence(byte0, byte1, byte2); -       std::advance(iterator, 3); -      } else if (remaining >= 4) { -       utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())}; -       if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes -        value = decode_utf8_sequence(byte0, byte1, byte2, byte3); -        std::advance(iterator, 4); -       } else -        throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); -      } else -       throw std::invalid_argument("Bad UTF-8 input: 4th byte expected, none found (end of sequence)"); -     } else -      throw std::invalid_argument("Bad UTF-8 input: 3rd byte expected, none found (end of sequence)"); -    } else -     throw std::invalid_argument("Bad UTF-8 input: 2nd byte expected, none found (end of sequence)"); -   -    // check only for sequences >= 2 bytes (ASCII is always compliant) -    if (!unicode::is_valid_unicode(value)) -     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value))); +   size_t constexpr sequence_length{sizeof...(Tbytes)}; +   static_assert(sequence_length >= 1 && sequence_length <= 4); -    return value; +   if constexpr(sequence_length > 1) { +    if (remaining_code_units() < sequence_length) +     throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence");     } + +   if (is_utf8_sequence(bytes...)) { +    std::advance(iterator, sequence_length); +    internal_type result{decode_utf8_sequence(bytes...)}; +    if (!unicode::is_valid_unicode<sequence_length * 6>(result)) +     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result))); +    return result; +   } else { +    if constexpr(sequence_length <= 3) // template recursion break condition: UTF-8 has 1..4 code units +     return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>())); +    else +     throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence"); +   } +  } + +  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> +  inline internal_type calculate_value() +  { +   return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>()));    }    template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> @@ -380,24 +371,13 @@ namespace unicode::detail {      return trailing_byte<m - n - 1>(value);    } -  template<typename Arg> -  inline void append(Arg&& arg) -  { -   if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) { -    s.append({arg}); -   } else { -    s.emplace_back(arg); -   } -  } - -  template<typename Arg, typename... Args> -  inline void append(Arg&& arg, Args&&... args) +  template<typename... Args> +  inline void append(Args&&... args)    {     if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) { -    s.append({arg, args...}); +    s.append({args...});     } else { -    s.emplace_back(arg); -    append(args...); +    (s.emplace_back(args), ...);     }    } | 
