-rw-r--r--   debian/control       |  5
-rw-r--r--   include/unicode.h    | 95
-rw-r--r--   src/test-unicode.cpp | 77
3 files changed, 126 insertions, 51 deletions
diff --git a/debian/control b/debian/control
index a06886a..fcc0185 100644
--- a/debian/control
+++ b/debian/control
@@ -15,10 +15,11 @@ Description: Unicode conversion library
  UTF-8, UTF-16 and UTF-32.
  .
  Features:
+  - Tested on Debian 10+11, Ubuntu 2004 to 2204
+  - C++17 and C++20 compatible
   - Additional support for ISO-8859-1 encoding (Latin-1) as subset of Unicode
   - Additional support for ISO-8859-15
-  - Tested on Debian 10+11, Ubuntu 2004 to 2110
-  - C++17 and C++20 compatible
+  - Header only

 Package: unicode-tools
 Architecture: any
diff --git a/include/unicode.h b/include/unicode.h
index 4064233..be91d77 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -47,12 +47,6 @@ namespace unicode::detail {

  using namespace std::string_literals;

- template<typename value_type>
- inline bool is_utf8_followup_byte(value_type b) noexcept
- {
-  return (b & 0b11000000) == 0b10000000;
- }
-
  template<size_t sequence_length, typename value_type>
  inline bool is_utf8_leading_byte(value_type byte) noexcept
  {
@@ -65,22 +59,26 @@ namespace unicode::detail {
   }
  }

+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept
+ {
+  return (b & 0b11000000) == 0b10000000;
+ }
+
  template<typename value_type, typename... Tbytes>
  inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
  {
   constexpr auto n{sizeof...(Tbytes) + 1};

-  static_assert(n <= 4);
+  static_assert(n <= 4, "UTF-8 sequences of 1 through 4 code units are supported");

   return is_utf8_leading_byte<n>(byte0) &&
-         (is_utf8_followup_byte(bytes) && ...);
+         (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right
  }

- template<typename T>
- inline bool validate_utf8(const std::basic_string<T>& s)
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
  {
-  static_assert(sizeof(T) == 1);
-
   int i{};
   auto size{s.size()};
   while (i < size) {
@@ -103,6 +101,48 @@ namespace unicode::detail {
   return true;
  }

+ template<typename value_type, typename... Twords>
+ inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
+ {
+  constexpr auto n{sizeof...(Twords) + 1};
+
+  static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+
+  if constexpr(n == 1) {
+   return is_valid_unicode(word0);
+  } else {
+   char16_t unit0 {static_cast<char16_t>(word0)};
+   char16_t unit1 {static_cast<char16_t>((words, ...))};
+   return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00;
+  }
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  int i{};
+  auto size{s.size()};
+  while (i < size) {
+   if (is_utf16_sequence(s[i])) {
+    i++;
+   } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) {
+    i += 2;
+   } else {
+    return false;
+   }
+  }
+  return true;
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  for (auto i: s)
+   if (!is_valid_unicode(i))
+    return false;
+  return true;
+ }
+
  template<typename value_type>
  inline char32_t continuation_value(value_type b) noexcept
  {
@@ -160,7 +200,7 @@ namespace unicode::detail {
    }
   }

-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
   inline internal_type calculate_value()
   {
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
@@ -201,7 +241,7 @@ namespace unicode::detail {
    }
   }

-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
   inline internal_type calculate_value()
   {
    char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
@@ -222,7 +262,7 @@ namespace unicode::detail {
    }
   }

-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
   inline internal_type calculate_value()
   {
    internal_type result {static_cast<internal_type>(get_code_unit<0>())};
@@ -348,7 +388,7 @@ namespace unicode::detail {
    }
   }

-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
   inline void append_utf(const internal_type& value)
   {
    if (value < 0x80) { // 1 byte
@@ -363,7 +403,7 @@ namespace unicode::detail {
    throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
   }

-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
   inline void append_utf(const internal_type& value)
   {
    if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
@@ -374,7 +414,7 @@ namespace unicode::detail {
    }
   }

-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
   inline void append_utf(const internal_type& value)
   {
    // expect value to be already valid Unicode values (checked in input iterator)
@@ -741,12 +781,12 @@ namespace unicode {
  template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
  typename To::string_type convert(const typename From::string_type& s)
  {
-  if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) {
-   if (validate_utf8<typename From::value_type>(s)) {
-    if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>)
-     return s;
-    else
-     return typename To::string_type{s.begin(), s.end()};
+  // if input type == output type, only validate and return input, is appropriate
+  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+               std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
+               std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
+   if (validate_utf<typename From::value_type>(s)) {
+    return s;
    } else {
     throw std::invalid_argument("Invalid UTF-8");
    }
@@ -848,12 +888,7 @@ namespace unicode {
  template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
  bool is_valid_utf(const typename Facet::string_type& s)
  {
-  try {
-   std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){});
-  } catch (const std::invalid_argument&) {
-   return false;
-  }
-  return true;
+  return validate_utf<typename Facet::value_type>(s);
  }

 } // namespace unicode
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 2675989..99e164b 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -283,6 +283,9 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
  // test facet interface
  result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t));
  BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
+
+ // test actual results by comparing with boost::locale::conv results
+ BOOST_CHECK_EQUAL(result, (boost::locale::conv::utf_to_utf<typename To::value_type, typename From::value_type>(std::get<i>(t))));

  // iterate over other combinations
  if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
@@ -650,26 +653,10 @@ BOOST_AUTO_TEST_CASE(convert)

  BOOST_CHECK((unicode::convert<char, char32_t>("äöü")) == std::u32string{U"äöü"});

-#ifdef _WIN32
- BOOST_CHECK(sizeof(wchar_t) == 2);
-#else // Unix like
- BOOST_CHECK(sizeof(wchar_t) == 4);
-#endif
-
- // For the following checks, wchar_t size and encoding is system dependent:
- // Windows: UTF-16
- // Linux: UTF-32
- BOOST_CHECK((unicode::convert<char, wchar_t>("äöü")) == std::wstring{L"äöü"});
- BOOST_CHECK((unicode::convert<char, wchar_t>("\u732b")) == std::wstring{L"\u732b"});
- BOOST_CHECK((unicode::convert<char, wchar_t>("\U0001F63A")) == std::wstring{L"\U0001F63A"});
- BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
- BOOST_CHECK((unicode::convert<wchar_t, utf8_t>(L"\U0001F63A")) == std::basic_string<utf8_t>{(utf8_t*)"\U0001F63A"});
+ // vector
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{})) == std::vector<char16_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<char16_t>{u'ä', u'ö', u'ü'}));

- BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
-
- BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
- BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
-
  // deque
  BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{});
  BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'}));
@@ -703,6 +690,58 @@ BOOST_AUTO_TEST_CASE(convert)
  BOOST_CHECK((unicode::convert<std::array<uint8_t, 6>, std::list<uint16_t>>(std::array<uint8_t, 6>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'}));
 }

+// wchar_t specific tests: system dependent
+BOOST_AUTO_TEST_CASE(convert_wstring)
+{
+#ifdef _WIN32
+ BOOST_CHECK(sizeof(wchar_t) == 2);
+#else // Unix like
+ BOOST_CHECK(sizeof(wchar_t) == 4);
+#endif
+
+ // For the following checks, wchar_t size and encoding is system dependent:
+ // Windows: UTF-16
+ // Linux: UTF-32
+ BOOST_CHECK((unicode::convert<char, wchar_t>("äöü")) == std::wstring{L"äöü"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>("\u732b")) == std::wstring{L"\u732b"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>("\U0001F63A")) == std::wstring{L"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, utf8_t>(L"\U0001F63A")) == std::basic_string<utf8_t>{(utf8_t*)"\U0001F63A"});
+
+ BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
+
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
+
+ std::u16string u16_value{u"\U0001F63A"};
+ std::u32string u32_value{U"\U0001F63A"};
+ std::wstring w_value{L"\U0001F63A"};
+
+ std::u16string result_u16_value{unicode::convert<std::wstring, std::u16string>(w_value)};
+ std::u32string result_u32_value{unicode::convert<std::wstring, std::u32string>(w_value)};
+ std::wstring result_w_value_1{unicode::convert<std::u16string, std::wstring>(u16_value)};
+ std::wstring result_w_value_2{unicode::convert<std::u32string, std::wstring>(u32_value)};
+
+ BOOST_CHECK_EQUAL(u16_value.size(), 2);
+ BOOST_CHECK_EQUAL(u32_value.size(), 1);
+ BOOST_CHECK_EQUAL(result_u16_value.size(), 2);
+ BOOST_CHECK_EQUAL(result_u32_value.size(), 1);
+ BOOST_CHECK_EQUAL(u16_value, result_u16_value);
+ BOOST_CHECK_EQUAL(u32_value, result_u32_value);
+ BOOST_CHECK(w_value == result_w_value_1);
+ BOOST_CHECK(w_value == result_w_value_2);
+#ifdef _WIN32
+ BOOST_CHECK_EQUAL(w_value.size(), 2);
+ BOOST_CHECK_EQUAL(result_w_value_1.size(), 2);
+ BOOST_CHECK_EQUAL(result_w_value_2.size(), 2);
+#else // Unix like
+ BOOST_CHECK_EQUAL(w_value.size(), 1);
+ BOOST_CHECK_EQUAL(result_w_value_1.size(), 1);
+ BOOST_CHECK_EQUAL(result_w_value_2.size(), 1);
+#endif
+
+}
+
 BOOST_AUTO_TEST_CASE(is_valid_utf)
 {
  BOOST_CHECK(unicode::is_valid_utf<char16_t>(u"äöü"));
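
For context, a minimal usage sketch of the interface touched by this commit. This is a sketch only, not part of the change: it assumes the header-only unicode.h is on the include path, a C++17 compiler, and UTF-8-encoded source files; the calls mirror those in src/test-unicode.cpp above, and the exact return types are assumed from the comparisons made there.

#include <cassert>
#include <string>
#include <vector>
#include "unicode.h"

int main()
{
 // character-type interface: UTF-8 (char) to UTF-32 (char32_t)
 std::u32string u32 = unicode::convert<char, char32_t>("äöü");
 assert(u32 == U"äöü");

 // container interface: UTF-8 bytes in a std::vector<char> to UTF-16 code units
 std::vector<char16_t> u16 = unicode::convert<std::vector<char>, std::vector<char16_t>>(
  std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'});
 assert((u16 == std::vector<char16_t>{u'ä', u'ö', u'ü'}));

 // validation without conversion; after this commit it is routed through the new validate_utf() overloads
 assert(unicode::is_valid_utf<char16_t>(u"äöü"));
}

When source and destination encodings are identical, the reworked convert() now only runs validate_utf() and returns its input unchanged, throwing std::invalid_argument on malformed data instead of re-encoding code unit by code unit.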
