Added support for char and wchar_t

author: Roland Reichwein <mail@reichwein.it> 2021-02-12 18:12:51 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-02-12 18:12:51 +0100
commit: 24ec1d5ba85503599fd301aa8cd56ee65651ab0b (patch)
tree: ccabf3b89338825720e926a73602862df03ae801
parent: b47110d30db3a416775c5de88e1d946dfdbda734 (diff)
4 files changed, 89 insertions, 51 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..01c9c5b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.a
+*.o
+*.d
+*.pem
+*.so
+*.swp
+default.profraw
diff --git a/debian/changelog b/debian/changelog
index 231944b..490318e 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,6 +1,7 @@
 unicode (1.1) unstable; urgency=medium
 
   * Fixed copyright
+  * Support Unicode conversion for basic types like char and wchar_t
 
  -- Roland Reichwein <mail@reichwein.it>  Fri, 05 Feb 2021 21:53:32 +0100
 
diff --git a/include/unicode.h b/include/unicode.h
index 2424fb1..d6f8e51 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -39,6 +39,8 @@ namespace unicode::detail {
  template<typename T>
  struct utf_iterator
  {
+  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
   typedef T input_type;
   typedef char32_t value_type;
   typedef char32_t& reference;
@@ -50,7 +52,7 @@ namespace unicode::detail {
   utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
    iterator(cbegin), end_iterator(cend)
   {
-   calculate_value<T>();
+   calculate_value();
   }
 
   utf_iterator<T>(const utf_iterator<T>& other) = default;
@@ -67,27 +69,6 @@ namespace unicode::detail {
    return *(iterator + index);
   }
 
-  // set value member
-  // default: char32_t for UTF-32
-  // specializations for UTF-8 and UTF-16 below
-  template<typename T1>
-  void calculate_value()
-  {
-   static_assert(sizeof(T1) == 4);
-
-   size_t remaining{remaining_code_units()};
-
-   if (!remaining)
-    return;
-
-   value = get_code_unit<0>();
-   
-   if (!unicode::is_valid_unicode(value))
-    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
-
-   sequence_length = 1;
-  }
-
   inline static bool is_continuation_byte(T b)
   {
    return (b & 0b11000000) == 0b10000000;
@@ -122,10 +103,7 @@ namespace unicode::detail {
    return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
   }
 
-  // GCC Bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85282
-  // specialization for UTF-8
-  template<>
-  void calculate_value<utf8_t>()
+  void calculate_value_utf8()
   {
    size_t remaining{remaining_code_units()};
    
@@ -168,9 +146,7 @@ namespace unicode::detail {
    }
   }
 
-  // specialization for UTF-16
-  template<>
-  void calculate_value<char16_t>()
+  void calculate_value_utf16()
   {
    size_t remaining{remaining_code_units()};
    
@@ -195,11 +171,40 @@ namespace unicode::detail {
    }
   }
 
+  void calculate_value_utf32()
+  {
+   size_t remaining{remaining_code_units()};
+
+   if (!remaining)
+    return;
+
+   value = get_code_unit<0>();
+   
+   if (!unicode::is_valid_unicode(value))
+    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+
+   sequence_length = 1;
+  }
+
+  // set value member
+  void calculate_value()
+  {
+   if constexpr(sizeof(T) == 1) {
+    calculate_value_utf8();
+   } else if constexpr (sizeof(T) == 2) {
+    calculate_value_utf16();
+   } else if constexpr (sizeof(T) == 4) {
+    calculate_value_utf32();
+   } else {
+    throw std::runtime_error("Invalid character size: "s + std::to_string(sizeof(T)));
+   }
+  }
+
   // pre-increment
   utf_iterator<T>& operator++()
   {
    iterator += sequence_length;
-   calculate_value<T>();
+   calculate_value();
    return *this;
   }
 
@@ -224,6 +229,8 @@ namespace unicode::detail {
  template<typename T>
  struct utf_back_insert_iterator
  {
+  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
   typedef T value_type;
   typedef std::basic_string<T> string_type;
   typedef utf_back_insert_iterator& reference;
@@ -253,16 +260,6 @@ namespace unicode::detail {
    return *this;
   }
 
-  // default: utf-32 code unit for UTF-32
-  // specializations for UTF-8 and UTF-16 below
-  template<typename T1=T>
-  reference operator=(const char32_t& value)
-  {
-   // expect value to be already valid Unicode values
-   s.push_back(value);
-   return *this;
-  }
-
   // n is number of UTF-8 bytes in sequence
   template<size_t n>
   inline static T byte0_of(char32_t value)
@@ -288,10 +285,7 @@ namespace unicode::detail {
     return trailing_byte<m - n - 1>(value);
   }
 
-  // specialization for UTF-8
-  // append utf-8 byte sequence
-  template<>
-  reference operator=<utf8_t>(const char32_t& value)
+  void append_utf8(const char32_t& value)
   {
    if (value < 0x80) { // 1 byte
     s.push_back(static_cast<value_type>(value));
@@ -309,13 +303,9 @@ namespace unicode::detail {
     s.push_back(byte_n_of_m<3,4>(value));
    } else
     throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
-   return *this;
   }
 
-  // specialization for UTF-16
-  // append utf-16 word sequence
-  template<>
-  reference operator=<char16_t>(const char32_t& value)
+  void append_utf16(const char32_t& value)
   {
    if (value <= 0xFFFF) { // expect value to be already valid Unicode values
     s.push_back(static_cast<value_type>(value));
@@ -324,6 +314,25 @@ namespace unicode::detail {
     s.push_back((value_reduced >> 10) + 0xD800);
     s.push_back((value_reduced & 0x3FF) + 0xDC00);
    }
+  }
+
+  void append_utf32(const char32_t& value)
+  {
+   // expects value to already be a valid Unicode value
+   s.push_back(value);
+  }
+
+  reference operator=(const char32_t& value)
+  {
+   if constexpr(sizeof(T) == 1) {
+    append_utf8(value);
+   } else if constexpr(sizeof(T) == 2) {
+    append_utf16(value);
+   } else if constexpr(sizeof(T) == 4) {
+    append_utf32(value);
+   } else {
+    throw std::runtime_error("Invalid type size: "s + std::to_string(sizeof(T)));
+   }
    return *this;
   }
 
@@ -555,9 +564,11 @@ namespace unicode {
  template<typename From, typename To>
  std::basic_string<To> convert(const std::basic_string<From>& s)
  {
+  typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
+  
   std::basic_string<To> result;
 
-  std::copy(Encoding<From>::Facet::begin(s), Encoding<From>::Facet::end(s), Encoding<To>::Facet::back_inserter(result));
+  std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));
 
   return result;
  }
@@ -566,8 +577,10 @@ namespace unicode {
  template<typename T>
  bool is_valid_utf(const std::basic_string<T>& s)
  {
+  typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait;
+  
   try {
-   std::for_each(Encoding<T>::Facet::begin(s), Encoding<T>::Facet::end(s), [](const char32_t& c){});
+   std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){});
   } catch (const std::invalid_argument&) {
    return false;
   }
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 692dfac..99a8f99 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -334,6 +334,23 @@ BOOST_AUTO_TEST_CASE(convert)
  
  BOOST_CHECK((unicode::convert<utf8_t,char16_t>("abc")) == std::u16string{u"abc"});
  BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"});
+
+ BOOST_CHECK((unicode::convert<char, char32_t>(u8"äöü")) == std::u32string{U"äöü"});
+
+#ifdef _WIN32
+ BOOST_CHECK(sizeof(wchar_t) == 2);
+#else // Unix like
+ BOOST_CHECK(sizeof(wchar_t) == 4);
+#endif
+
+ // For the following checks, wchar_t size and encoding are system-dependent:
+ // Windows: UTF-16
+ // Linux: UTF-32
+ BOOST_CHECK((unicode::convert<char, wchar_t>(u8"äöü")) == std::wstring{L"äöü"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>(u8"\u732b")) == std::wstring{L"\u732b"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>(u8"\U0001F63A")) == std::wstring{L"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, char>(L"\U0001F63A")) == std::string{u8"\U0001F63A"});
 }
 
 BOOST_AUTO_TEST_CASE(is_valid_utf)
author	Roland Reichwein <mail@reichwein.it>	2021-02-12 18:12:51 +0100
committer	Roland Reichwein <mail@reichwein.it>	2021-02-12 18:12:51 +0100
commit	24ec1d5ba85503599fd301aa8cd56ee65651ab0b (patch)
tree	ccabf3b89338825720e926a73602862df03ae801
parent	b47110d30db3a416775c5de88e1d946dfdbda734 (diff)