4 files changed, 35 insertions, 112 deletions
diff --git a/include/unicode.h b/include/unicode.h
index eb872ec..2bf17f4 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -214,7 +214,6 @@ namespace unicode {
  template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true>
  inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)
  {
-#if 1
   if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) {
    result.append({
                  static_cast<To>(accu & 0x7F),
@@ -229,7 +228,6 @@ namespace unicode {
    accu = 0;
    bytes_in_accu = 0;
   } else
-#endif
   if ((accu & 0x80) == 0) { // 1 byte sequence
    append_utf<7>(result, static_cast<char32_t>(accu & 0x7F));
    accu >>= 8;
@@ -262,116 +260,39 @@ namespace unicode {
    throw std::invalid_argument("Invalid UTF-8 byte sequence");
  }
 
- // Little Endian optimized version for UTF-16
- // In block_mode, at least 4 bytes are in accu. On first call, even 8.
- // otherwise, at least one code unit is in accu
- template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_16_v<From>, bool> = true>
- inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)
- {
-#if 1
-  if ((accu & 0xFF80FF80FF80FF80) == 0) {
-   auto number_of_values{bytes_in_accu / sizeof(From)};
-   result.resize(result.size() + number_of_values);
-   for (int i = 0; i < number_of_values; i++) {
-    result[result.size() - number_of_values + i] = static_cast<To>(accu & 0x7F);
-    accu >>= 16;
-   }
-   bytes_in_accu = 0;
-  } else
-#endif
-  if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) {
-   // found 4 code units forming 3 code points in UTF-16;
-   // by definition of UTF-16, we have valid unicode values at this point
-   if constexpr(is_utf_32_v<To>) {
-    //result.resize(result.size() + 2);
-    //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000;
-    result.append({
-                  static_cast<To>(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000),
-                  static_cast<To>(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000)
-                  });
-   } else {
-    append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000);
-    append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000);
-   }
-   accu = 0;
-   bytes_in_accu = 0;
-  } else
-  if (From unit0 {static_cast<From>(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) {
-   append_utf<16>(result, unit0);
-   accu >>= 16;
-   bytes_in_accu -= 2;
-  } else
-  if ((accu & 0xFC00FC00) == 0xDC00D800) {
-   // found 2 code units forming 1 code point in UTF-16;
-   // by definition of UTF-16, we have a valid unicode value at this point
-   append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000);
-   accu >>= 32;
-   bytes_in_accu -= 4;
-  } else
-   throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
- }
-
  // Little Endian optimized version
  template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>
  typename To::string_type convert_optimized_utf(const typename From::string_type& s)
  {
   typename To::string_type result;
-  if constexpr(is_utf_32_v<typename From::value_type>) {
-   for (const auto value: s) {
-    if (is_valid_unicode(value))
-     append_utf(result, value);
-    else
-     throw std::invalid_argument("Invalid Unicode character in UTF-32");
+  uint64_t accu{};
+  int bytes_in_accu{};
+
+  size_t s_index{};
+  size_t s_size{s.size()};
+  while (s_index + 8 / sizeof(typename From::value_type) <= s_size) {
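+   // Read input: OR the next 8 bytes of s into accu, above the bytes already held there.
+   // Assumes bytes_in_accu < 8, so the shift below stays under 64 bits.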
+   accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
+   s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type);
+   bytes_in_accu = 8;
+
+   while (bytes_in_accu >= 4) {
+    append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu);
    }
-#if 0
-  } else if constexpr(is_utf_16_v<typename From::value_type>) {
-   for (int i = 0; i < s.size(); i++) {
-    typename From::value_type unit0{s[i]};
-    if (is_valid_unicode(unit0)) {
-     append_utf(result, unit0);
-    } else {
-     i++;
-     if (i < s.size()) {
-      typename From::value_type unit1 {s[i]};
-      if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
-       throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
-
-      append_utf(result, (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000);
-     } else
-      throw std::invalid_argument("Invalid code unit at end of UTF-16 string");
-    }
-   }
-#endif
-  } else {
-   uint64_t accu{};
-   int bytes_in_accu{};
-
-   size_t s_index{};
-   size_t s_size{s.size()};
-   while (s_index + 8 / sizeof(typename From::value_type) <= s_size) {
-    // read input
-    // assume: bytes_in_accu < 8
-    accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
-    s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type);
-    bytes_in_accu = 8;
-
-    while (bytes_in_accu >= 4) {
-     append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu);
-    }
-   }
-
-   // 0..3 bytes left in accu
-   // 0..7 bytes left in s
+  }
 
-   while (s_index < s_size || bytes_in_accu > 0) {
-    while (s_index < s_size && bytes_in_accu < 8) {
-     accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
-     ++s_index;
-     bytes_in_accu += sizeof(typename From::value_type);
-    }
+  // The block loop above leaves 0..3 bytes in accu
+  // and fewer than 8 unread bytes in s; the loop below handles them one code unit at a time.
 
-    append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu);
+  while (s_index < s_size || bytes_in_accu > 0) {
+   while (s_index < s_size && bytes_in_accu < 8) {
+    accu |= static_cast<uint64_t>(*reinterpret_cast<const typename From::value_type*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
+    ++s_index;
+    bytes_in_accu += sizeof(typename From::value_type);
    }
+
+   append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu);
   }
   return result;
  }
@@ -408,7 +329,7 @@ namespace unicode {
  ToContainer convert(const FromContainer& s)
  {
   typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
-  
+
   ToContainer result;
 
   std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));
diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h
index c3507e7..63c7d69 100644
--- a/include/unicode/type_traits.h
+++ b/include/unicode/type_traits.h
@@ -50,7 +50,7 @@ namespace unicode {
  template<typename T>
  struct is_utf_8
  {
-  static const bool value{std::is_trivial_v<T> && sizeof(T) == 1};
+  static const bool value{std::is_same_v<T, UTF_8> || (std::is_trivial_v<T> && sizeof(T) == 1)};
  };
  
  template<typename T>
@@ -59,7 +59,7 @@ namespace unicode {
  template<typename T>
  struct is_utf_16
  {
-  static const bool value{std::is_trivial_v<T> && sizeof(T) == 2};
+  static const bool value{std::is_same_v<T, UTF_16> || (std::is_trivial_v<T> && sizeof(T) == 2)};
  };
  
  template<typename T>
@@ -68,7 +68,7 @@ namespace unicode {
  template<typename T>
  struct is_utf_32
  {
-  static const bool value{std::is_trivial_v<T> && sizeof(T) == 4};
+  static const bool value{std::is_same_v<T, UTF_32> || (std::is_trivial_v<T> && sizeof(T) == 4)};
  };
  
  template<typename T>
diff --git a/include/unicode/utf.h b/include/unicode/utf.h
index 81e8f2b..046d9c6 100644
--- a/include/unicode/utf.h
+++ b/include/unicode/utf.h
@@ -415,11 +415,6 @@ namespace unicode {
   }
  };
 
- // Encoding for convert()
- typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8;
- typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
- typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
-
  // Helper to get correct Encoding from char type, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type>
  template<typename T>
  struct Encoding
diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h
index f3f6c52..c42dea1 100644
--- a/include/unicode/utf_fwd.h
+++ b/include/unicode/utf_fwd.h
@@ -2,6 +2,8 @@
 
 // Forward declarations
 
+#include "types.h"
+
 #include <string>
 
 namespace unicode::detail {
@@ -19,5 +21,10 @@ namespace unicode {
  template<typename InputIt, typename OutputIt>
  struct UTF;
 
+ // Encoding for convert()
+ typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8;
+ typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
+ typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
+
 } // namespace unicode