Simplify utf_iterator for input, build on Debian 10+11, Ubuntu 2004-2204

author: Roland Reichwein <mail@reichwein.it> 2021-12-19 19:51:38 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-12-19 19:51:38 +0100
commit: 9dc97269201603dd479e15a736a64479a5095556 (patch)
tree: b5f215bf9cfbbf2bee092505f4fdfbf3e4501b7b
parent: e24a0d5d371d0916dbfb375d3ea404f7e6237c74 (diff)
4 files changed, 70 insertions, 63 deletions
diff --git a/Makefile b/Makefile
index e8a8f29..346f8a0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,38 +1,56 @@
 PROJECTNAME=unicode
 VERSION=$(shell dpkg-parsechangelog --show-field Version)
+ifeq ($(shell lsb_release -si),Debian)
+ONDEBIAN=yes
+else
+ONDEBIAN=no
+endif
 
-DISTROS=base debian11 ubuntu2110
+# On Ubuntu 2104 and 2110, dh_strip / debugedit is broken; therefore, non-Debian builds use different options below
+DISTROS=base debian10 debian11 ubuntu2004 ubuntu2010 ubuntu2104 ubuntu2110 ubuntu2204
 
 ifeq ($(wildcard $(shell which clang++-13)),)
+ifeq ($(wildcard $(shell which clang++-12)),)
+ifeq ($(wildcard $(shell which clang++-11)),)
 ifeq ($(wildcard $(shell which clang++)),)
 CXX=g++-11
 else
 CXX=clang++
 endif
 else
+CXX=clang++-11
+endif
+else
+CXX=clang++-12
+endif
+else
 CXX=clang++-13
 endif
 
-# boost is buggy for C++20: error: static_assert failed due to requirement 'detail::is_endian_reversible_inplace<char8_t>
-#STANDARD=c++17
-STANDARD=c++20
+STANDARD=c++17
 
 ifeq ($(CXXFLAGS),)
 #CXXFLAGS=-O0 -g -D_DEBUG
 CXXFLAGS=-O2 -DNDEBUG
 endif
 
-CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD)
-
 ifeq ($(CXX),clang++-13)
+ifeq ($(ONDEBIAN),yes)
 COMPILER_SUITE=clang
 LIBS+=-fuse-ld=lld-13
+# boost is buggy for C++20: error: static_assert failed due to requirement 'detail::is_endian_reversible_inplace<char8_t>
+STANDARD=c++20
+endif
 endif
 
 ifeq ($(CXX),clang++)
+ifeq ($(ONDEBIAN),yes)
 COMPILER_SUITE=clang
 LIBS+=-fuse-ld=lld
 endif
+endif
+
+CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD)
 
 LDLIBS+=\
 -lboost_context \
diff --git a/debian/control b/debian/control
index 933d5f8..0c236a3 100644
--- a/debian/control
+++ b/debian/control
@@ -2,7 +2,7 @@ Source: unicode
 Section: devel
 Priority: optional
 Maintainer: Roland Reichwein <mail@reichwein.it>
-Build-Depends: debhelper (>= 12), libboost-all-dev | libboost1.74-all-dev, libc++-dev | libc++-13-dev | libc++-11-dev, libc++abi-dev | libc++abi-13-dev | libc++abi-11-dev, lld | lld-13 | lld-11, clang | clang-13 | clang-11
+Build-Depends: debhelper (>= 12), libboost1.74-all-dev | libboost-all-dev, libc++-13-dev | libc++-12-dev | libc++-11-dev | libc++-dev, libc++abi-13-dev | libc++abi-12-dev | libc++abi-11-dev | libc++abi-dev, lld-13 | lld-12 | lld-11 | lld, clang-13 | clang-12 | clang-11 | clang, libunwind-13-dev | libunwind-12-dev | libunwind-dev, llvm-13-linker-tools | llvm-12-linker-tools | llvm-11-linker-tools | clang
 Standards-Version: 4.5.0
 Homepage: http://www.reichwein.it/unicode/
 
@@ -17,7 +17,7 @@ Description: Unicode conversion library
   Features:
   - Additional support for ISO-8859-1 encoding (Latin-1) as subset of Unicode
   - Additional support for ISO-8859-15
-  - Tested on Debian 11, Debian 10, Ubuntu 2004, Ubuntu 2010
+  - Tested on Debian 10+11, Ubuntu 2004 to 2110
   - C++17 and C++20 compatible
 
 Package: unicode-tools
diff --git a/include/unicode.h b/include/unicode.h
index 6b6f21a..6d8aac5 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -28,7 +28,7 @@ namespace unicode {
 
  // usually, char32_t, uint32_t etc.
  template<typename T>
- static inline bool is_valid_unicode(const T& value)
+ static inline bool is_valid_unicode(const T& value) noexcept
  {
    return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
  }
@@ -55,19 +55,18 @@ namespace unicode::detail {
   utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
    iterator(cbegin), end_iterator(cend)
   {
-   calculate_value();
   }
 
   utf_iterator(const utf_iterator& other) = default;
   utf_iterator& operator=(const utf_iterator& other) = default;
 
-  size_t remaining_code_units() const
+  size_t remaining_code_units() const noexcept
   {
    return std::distance(iterator, end_iterator);
   }
 
   template<size_t index>
-  T get_code_unit() const
+  T get_code_unit() const noexcept
   {
    if constexpr (std::is_same<Container, typename std::list<T>>::value) {
     // std::list doesn't support it + n
@@ -79,46 +78,49 @@ namespace unicode::detail {
    }
   }
 
-  inline static bool is_continuation_byte(T b)
+  inline static bool is_continuation_byte(T b) noexcept
   {
    return (b & 0b11000000) == 0b10000000;
   }
 
   template<typename... Targs>
-  inline static bool is_continuation_byte(T b, Targs... Fargs)
+  inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept
   {
    return is_continuation_byte(b) && is_continuation_byte(Fargs...);
   }
 
   template<size_t n>
-  inline static bool is_byte0_of(T b)
+  inline static bool is_byte0_of(T b) noexcept
   {
    return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
   }
 
-  inline static char32_t continuation_value(T b)
+  inline static char32_t continuation_value(T b) noexcept
   {
    return static_cast<char32_t>(b & 0b00111111);
   }
 
   template<typename... Targs>
-  inline static char32_t continuation_value(T b, Targs... Fargs)
+  inline static char32_t continuation_value(T b, Targs... Fargs) noexcept
   {
    return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
   }
 
   template<size_t n>
-  inline static char32_t value_byte0_of(T b)
+  inline static char32_t value_byte0_of(T b) noexcept
   {
    return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
   }
 
-  void calculate_value_utf8()
+  template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+  inline value_type calculate_value()
   {
    size_t remaining{remaining_code_units()};
    
    if (!remaining)
-    return;
+    return {};
+
+   value_type value{};
 
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
    if (byte0 & 0x80) { // 2-4 bytes
@@ -126,17 +128,17 @@ namespace unicode::detail {
      utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
      if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
       value = value_byte0_of<2>(byte0) | continuation_value(byte1);
-      sequence_length = 2;
+      std::advance(iterator, 2);
      } else if (remaining >= 3) {
       utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
       if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
        value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
-       sequence_length = 3;
+       std::advance(iterator, 3);
       } else if (remaining >= 4) {
        utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
        if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
         value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
-        sequence_length = 4;
+        std::advance(iterator, 4);
        } else
         throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
       } else
@@ -152,22 +154,25 @@ namespace unicode::detail {
 
    } else { // 1 byte: 7 bit ASCII
     value = byte0;
-    sequence_length = 1;
+    std::advance(iterator, 1);
    }
+
+   return value;
   }
 
-  void calculate_value_utf16()
+  template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+  inline value_type calculate_value()
   {
    size_t remaining{remaining_code_units()};
    
    if (!remaining)
-    return;
+    return {};
 
    char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
 
    if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
-    value = unit0;
-    sequence_length = 1;
+    std::advance(iterator, 1);
+    return unit0;
    } else {
     if (remaining < 2)
      throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
@@ -176,45 +181,32 @@ namespace unicode::detail {
     if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
      throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
 
-    value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
-    sequence_length = 2;
+    std::advance(iterator, 2);
+    return (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
    }
   }
 
-  void calculate_value_utf32()
+  template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+  inline value_type calculate_value()
   {
    size_t remaining{remaining_code_units()};
 
    if (!remaining)
-    return;
+    return {};
 
-   value = static_cast<char32_t>(get_code_unit<0>());
-   
-   if (!unicode::is_valid_unicode(value))
-    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+   value_type result {static_cast<char32_t>(get_code_unit<0>())};
 
-   sequence_length = 1;
-  }
+   if (!unicode::is_valid_unicode(result))
+    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
 
-  // set value member
-  void calculate_value()
-  {
-   static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+   std::advance(iterator, 1);
 
-   if constexpr(sizeof(T) == 1) {
-    calculate_value_utf8();
-   } else if constexpr (sizeof(T) == 2) {
-    calculate_value_utf16();
-   } else if constexpr (sizeof(T) == 4) {
-    calculate_value_utf32();
-   }
+   return result;
   }
 
   // pre-increment
   utf_iterator& operator++()
   {
-   std::advance(iterator, sequence_length);
-   calculate_value();
    return *this;
   }
 
@@ -223,17 +215,14 @@ namespace unicode::detail {
    return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
   }
 
-  reference operator*()
+  value_type operator*()
   {
-   return value;
+   return calculate_value();
   }
 
  private:
   typename string_type::const_iterator iterator;
   typename string_type::const_iterator end_iterator;
-
-  char32_t value{}; // always save complete unicode code point at this point
-  size_t sequence_length{};
  };
 
  template<typename T, typename Container=std::basic_string<T>>
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 29e5c2e..d00a33d 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -372,7 +372,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
  std::transform(u32list.begin(), u32list.end(), std::back_inserter(u16list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_16>(s);});
 
  // Fill UTF-8 data list
- std::vector<std::u8string> u8list;
+ std::vector<std::basic_string<utf8_t>> u8list;
  std::transform(u32list.begin(), u32list.end(), std::back_inserter(u8list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_8>(s);});
 
  for (const auto& i : u32list) {
@@ -380,7 +380,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
   BOOST_CHECK(s32.size() == i.size());
   std::u16string s16{unicode::convert<unicode::UTF_32, unicode::UTF_16>(i)};
   BOOST_CHECK(s16.size() >= i.size());
-  std::u8string s8{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
+  std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
   BOOST_CHECK(s8.size() >= i.size());
  }
 
@@ -389,7 +389,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
   BOOST_CHECK(s32.size() > 0 || i.size() == 0);
   std::u16string s16{unicode::convert<unicode::UTF_16, unicode::UTF_16>(i)};
   BOOST_CHECK(s16.size() == i.size());
-  std::u8string s8{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
+  std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
   BOOST_CHECK(s8.size() >= i.size());
  }
 
@@ -398,7 +398,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
   BOOST_CHECK(s32.size() > 0 || i.size() == 0);
   std::u16string s16{unicode::convert<unicode::UTF_8, unicode::UTF_16>(i)};
   BOOST_CHECK(s16.size() > 0 || i.size() == 0);
-  std::u8string s8{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
+  std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
   BOOST_CHECK(s8.size() == i.size());
  }
 
@@ -424,7 +424,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
   // Performance test UTF-32 -> UTF-8
   auto t0{std::chrono::steady_clock::now()};
   for (const auto& i : u32list) {
-   std::u8string s{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
+   std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
   }
   std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
  }
@@ -451,7 +451,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
   // Performance test UTF-16 -> UTF-8
   auto t0{std::chrono::steady_clock::now()};
   for (const auto& i : u16list) {
-   std::u8string s{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
+   std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
   }
   std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
  }
@@ -478,7 +478,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
   // Performance test UTF-8 -> UTF-8
   auto t0{std::chrono::steady_clock::now()};
   for (const auto& i : u8list) {
-   std::u8string s{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
+   std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
   }
   std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
  }
author	Roland Reichwein <mail@reichwein.it>	2021-12-19 19:51:38 +0100
committer	Roland Reichwein <mail@reichwein.it>	2021-12-19 19:51:38 +0100
commit	9dc97269201603dd479e15a736a64479a5095556 (patch)
tree	b5f215bf9cfbbf2bee092505f4fdfbf3e4501b7b
parent	e24a0d5d371d0916dbfb375d3ea404f7e6237c74 (diff)