Optimize UTF validation

author: Roland Reichwein <mail@reichwein.it> 2021-12-28 12:46:30 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-12-28 12:46:30 +0100
commit: 403c885d67f79c637ebcb303722adfd6a4b8195e (patch)
tree: d8f40c674a5c65176e028a1c7bb9122baa2e7756
parent: 970ba4111160fbf78351b21a024c46c0978e0440 (diff)
3 files changed, 126 insertions, 51 deletions
diff --git a/debian/control b/debian/control
index a06886a..fcc0185 100644
--- a/debian/control
+++ b/debian/control
@@ -15,10 +15,11 @@ Description: Unicode conversion library
  UTF-8, UTF-16 and UTF-32.
  .
   Features:
+  - Tested on Debian 10+11, Ubuntu 2004 to 2204
+  - C++17 and C++20 compatible
   - Additional support for ISO-8859-1 encoding (Latin-1) as subset of Unicode
   - Additional support for ISO-8859-15
-  - Tested on Debian 10+11, Ubuntu 2004 to 2110
-  - C++17 and C++20 compatible
+  - Header only
 
 Package: unicode-tools
 Architecture: any
diff --git a/include/unicode.h b/include/unicode.h
index 4064233..be91d77 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -47,12 +47,6 @@ namespace unicode::detail {
 
  using namespace std::string_literals;
 
- template<typename value_type>
- inline bool is_utf8_followup_byte(value_type b) noexcept
- {
-  return (b & 0b11000000) == 0b10000000;
- }
-
  template<size_t sequence_length, typename value_type>
  inline bool is_utf8_leading_byte(value_type byte) noexcept
  {
@@ -65,22 +59,26 @@ namespace unicode::detail {
   }
  }
 
+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept
+ {
+  return (b & 0b11000000) == 0b10000000;
+ }
+
  template<typename value_type, typename... Tbytes>
  inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
  {
   constexpr auto n{sizeof...(Tbytes) + 1};
 
-  static_assert(n <= 4);
+  static_assert(n <= 4, "UTF-8 sequences of 1 through 4 code units are supported");
 
   return is_utf8_leading_byte<n>(byte0) &&
-         (is_utf8_followup_byte(bytes) && ...);
+         (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right
  }
 
- template<typename T>
- inline bool validate_utf8(const std::basic_string<T>& s)
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
  {
-  static_assert(sizeof(T) == 1);
-
   int i{};
   auto size{s.size()};
   while (i < size) {
@@ -103,6 +101,48 @@ namespace unicode::detail {
   return true;
  }
 
+ template<typename value_type, typename... Twords>
+ inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
+ {
+  constexpr auto n{sizeof...(Twords) + 1};
+
+  static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+
+  if constexpr(n == 1) {
+   return is_valid_unicode(word0);
+  } else {
+   char16_t unit0 {static_cast<char16_t>(word0)};
+   char16_t unit1 {static_cast<char16_t>((words, ...))};
+   return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00;
+  }
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  int i{};
+  auto size{s.size()};
+  while (i < size) {
+   if (is_utf16_sequence(s[i])) {
+    i++;
+   } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) {
+    i += 2;
+   } else {
+    return false;
+   }
+  }
+  return true;
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  for (auto i: s)
+   if (!is_valid_unicode(i))
+    return false;
+  return true;
+ }
+
  template<typename value_type>
  inline char32_t continuation_value(value_type b) noexcept
  {
@@ -160,7 +200,7 @@ namespace unicode::detail {
    }
   }
 
-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
   inline internal_type calculate_value()
   {
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
@@ -201,7 +241,7 @@ namespace unicode::detail {
    }
   }
 
-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
   inline internal_type calculate_value()
   {
    char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
@@ -222,7 +262,7 @@ namespace unicode::detail {
    }
   }
 
-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
   inline internal_type calculate_value()
   {
    internal_type result {static_cast<internal_type>(get_code_unit<0>())};
@@ -348,7 +388,7 @@ namespace unicode::detail {
    }
   }
 
-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
   inline void append_utf(const internal_type& value)
   {
    if (value < 0x80) { // 1 byte
@@ -363,7 +403,7 @@ namespace unicode::detail {
     throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
   }
 
-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
   inline void append_utf(const internal_type& value)
   {
    if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
@@ -374,7 +414,7 @@ namespace unicode::detail {
    }
   }
 
-  template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
   inline void append_utf(const internal_type& value)
   {
    // expect value to be already valid Unicode values (checked in input iterator)
@@ -741,12 +781,12 @@ namespace unicode {
  template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
  typename To::string_type convert(const typename From::string_type& s)
  {
-  if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) {
-   if (validate_utf8<typename From::value_type>(s)) {
-    if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>)
-     return s;
-    else
-     return typename To::string_type{s.begin(), s.end()};
+  // if input type == output type, only validate and return input, if appropriate
+  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+               std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
+               std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
+   if (validate_utf<typename From::value_type>(s)) {
+    return s;
    } else {
     throw std::invalid_argument("Invalid UTF-8");
    }
@@ -848,12 +888,7 @@ namespace unicode {
  template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
  bool is_valid_utf(const typename Facet::string_type& s)
  {
-  try {
-   std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){});
-  } catch (const std::invalid_argument&) {
-   return false;
-  }
-  return true;
+  return validate_utf<typename Facet::value_type>(s);
  }
 
 } // namespace unicode
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 2675989..99e164b 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -283,6 +283,9 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
  // test facet interface
  result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t));
  BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
+
+ // test actual results by comparing with boost::locale::conv results
+ BOOST_CHECK_EQUAL(result, (boost::locale::conv::utf_to_utf<typename To::value_type, typename From::value_type>(std::get<i>(t))));
  
  // iterate over other combinations
  if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
@@ -650,26 +653,10 @@ BOOST_AUTO_TEST_CASE(convert)
 
  BOOST_CHECK((unicode::convert<char, char32_t>("äöü")) == std::u32string{U"äöü"});
 
-#ifdef _WIN32
- BOOST_CHECK(sizeof(wchar_t) == 2);
-#else // Unix like
- BOOST_CHECK(sizeof(wchar_t) == 4);
-#endif
-
- // For the following checks, wchar_t size and encoding is system dependent:
- // Windows: UTF-16
- // Linux: UTF-32
- BOOST_CHECK((unicode::convert<char, wchar_t>("äöü")) == std::wstring{L"äöü"});
- BOOST_CHECK((unicode::convert<char, wchar_t>("\u732b")) == std::wstring{L"\u732b"});
- BOOST_CHECK((unicode::convert<char, wchar_t>("\U0001F63A")) == std::wstring{L"\U0001F63A"});
- BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
- BOOST_CHECK((unicode::convert<wchar_t, utf8_t>(L"\U0001F63A")) == std::basic_string<utf8_t>{(utf8_t*)"\U0001F63A"});
+ // vector
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{})) == std::vector<char16_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<char16_t>{u'ä', u'ö', u'ü'}));
 
- BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
- 
- BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
- BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
- 
  // deque
  BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{});
  BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'}));
@@ -703,6 +690,58 @@ BOOST_AUTO_TEST_CASE(convert)
  BOOST_CHECK((unicode::convert<std::array<uint8_t, 6>, std::list<uint16_t>>(std::array<uint8_t, 6>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'}));
 }
 
+// wchar_t specific tests: system dependent
+BOOST_AUTO_TEST_CASE(convert_wstring)
+{
+#ifdef _WIN32
+ BOOST_CHECK(sizeof(wchar_t) == 2);
+#else // Unix like
+ BOOST_CHECK(sizeof(wchar_t) == 4);
+#endif
+
+ // For the following checks, wchar_t size and encoding are system dependent:
+ // Windows: UTF-16
+ // Linux: UTF-32
+ BOOST_CHECK((unicode::convert<char, wchar_t>("äöü")) == std::wstring{L"äöü"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>("\u732b")) == std::wstring{L"\u732b"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>("\U0001F63A")) == std::wstring{L"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, utf8_t>(L"\U0001F63A")) == std::basic_string<utf8_t>{(utf8_t*)"\U0001F63A"});
+
+ BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
+ 
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
+ 
+ std::u16string u16_value{u"\U0001F63A"};
+ std::u32string u32_value{U"\U0001F63A"};
+ std::wstring w_value{L"\U0001F63A"};
+
+ std::u16string result_u16_value{unicode::convert<std::wstring, std::u16string>(w_value)};
+ std::u32string result_u32_value{unicode::convert<std::wstring, std::u32string>(w_value)};
+ std::wstring result_w_value_1{unicode::convert<std::u16string, std::wstring>(u16_value)};
+ std::wstring result_w_value_2{unicode::convert<std::u32string, std::wstring>(u32_value)};
+
+ BOOST_CHECK_EQUAL(u16_value.size(), 2);
+ BOOST_CHECK_EQUAL(u32_value.size(), 1);
+ BOOST_CHECK_EQUAL(result_u16_value.size(), 2);
+ BOOST_CHECK_EQUAL(result_u32_value.size(), 1);
+ BOOST_CHECK_EQUAL(u16_value, result_u16_value);
+ BOOST_CHECK_EQUAL(u32_value, result_u32_value);
+ BOOST_CHECK(w_value == result_w_value_1);
+ BOOST_CHECK(w_value == result_w_value_2);
+#ifdef _WIN32
+ BOOST_CHECK_EQUAL(w_value.size(), 2);
+ BOOST_CHECK_EQUAL(result_w_value_1.size(), 2);
+ BOOST_CHECK_EQUAL(result_w_value_2.size(), 2);
+#else // Unix like
+ BOOST_CHECK_EQUAL(w_value.size(), 1);
+ BOOST_CHECK_EQUAL(result_w_value_1.size(), 1);
+ BOOST_CHECK_EQUAL(result_w_value_2.size(), 1);
+#endif
+
+}
+
 BOOST_AUTO_TEST_CASE(is_valid_utf)
 {
  BOOST_CHECK(unicode::is_valid_utf<char16_t>(u"äöü"));
author	Roland Reichwein <mail@reichwein.it>	2021-12-28 12:46:30 +0100
committer	Roland Reichwein <mail@reichwein.it>	2021-12-28 12:46:30 +0100
commit	403c885d67f79c637ebcb303722adfd6a4b8195e (patch)
tree	d8f40c674a5c65176e028a1c7bb9122baa2e7756
parent	970ba4111160fbf78351b21a024c46c0978e0440 (diff)